diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h --- a/clang/include/clang/AST/Type.h +++ b/clang/include/clang/AST/Type.h @@ -3947,6 +3947,19 @@ /// because TrailingObjects cannot handle repeated types. struct ExceptionType { QualType Type; }; + /// The AArch64 SME ACLE (Arm C/C++ Language Extensions) define a number + /// of function type attributes that can be set on function types, including + /// function pointers. + enum AArch64SMETypeAttributes : unsigned { + SME_NormalFunction = 0, + SME_PStateSMEnabledMask = 1 << 0, + SME_PStateSMCompatibleMask = 1 << 1, + SME_PStateZASharedMask = 1 << 2, + SME_PStateZAPreservedMask = 1 << 3, + SME_AttributeMask = 255 // We only support maximum 8 bits because of the + // bitmask in FunctionTypeExtraBitfields. + }; + /// A simple holder for various uncommon bits which do not fit in /// FunctionTypeBitfields. Aligned to alignof(void *) to maintain the /// alignment of subsequent objects in TrailingObjects. @@ -3955,6 +3968,13 @@ /// A whole unsigned is not needed here and according to /// [implimits] 8 bits would be enough here. unsigned NumExceptionType = 0; + + /// Any AArch64 SME ACLE type attributes that need to be propagated + /// on declarations and function pointers. + unsigned AArch64SMEAttributes : 8; + + FunctionTypeExtraBitfields() + : AArch64SMEAttributes(SME_NormalFunction) {} }; protected: @@ -4135,16 +4155,20 @@ FunctionType::ExtInfo ExtInfo; bool Variadic : 1; bool HasTrailingReturn : 1; + unsigned AArch64SMEAttributes : 8; Qualifiers TypeQuals; RefQualifierKind RefQualifier = RQ_None; ExceptionSpecInfo ExceptionSpec; const ExtParameterInfo *ExtParameterInfos = nullptr; SourceLocation EllipsisLoc; - ExtProtoInfo() : Variadic(false), HasTrailingReturn(false) {} + ExtProtoInfo() + : Variadic(false), HasTrailingReturn(false), + AArch64SMEAttributes(SME_NormalFunction) {} ExtProtoInfo(CallingConv CC) - : ExtInfo(CC), Variadic(false), HasTrailingReturn(false) {} + : ExtInfo(CC), Variadic(false), HasTrailingReturn(false), + AArch64SMEAttributes(SME_NormalFunction) {} ExtProtoInfo withExceptionSpec(const ExceptionSpecInfo &ESI) { ExtProtoInfo Result(*this); @@ -4153,7 +4177,16 @@ } bool requiresFunctionProtoTypeExtraBitfields() const { - return ExceptionSpec.Type == EST_Dynamic; + return ExceptionSpec.Type == EST_Dynamic || + AArch64SMEAttributes != SME_NormalFunction; + } + + void setArmSMEAttribute(AArch64SMETypeAttributes Kind, bool Enable = true) { + if (Enable) + AArch64SMEAttributes |= Kind; + else + AArch64SMEAttributes = + (AArch64SMEAttributes ^ Kind) & AArch64SMEAttributes; } }; @@ -4280,6 +4313,7 @@ EPI.TypeQuals = getMethodQuals(); EPI.RefQualifier = getRefQualifier(); EPI.ExtParameterInfos = getExtParameterInfosOrNull(); + EPI.AArch64SMEAttributes = getAArch64SMEAttributes(); return EPI; } @@ -4461,6 +4495,15 @@ return getTrailingObjects(); } + /// Return a bitmask describing the SME attributes on the function type, see + /// AArch64SMETypeAttributes for their values. + unsigned getAArch64SMEAttributes() const { + if (!hasExtraBitfields()) + return SME_NormalFunction; + return getTrailingObjects()->AArch64SMEAttributes; + } + + ExtParameterInfo getExtParameterInfo(unsigned I) const { assert(I < getNumParams() && "parameter index out of range"); if (hasExtParameterInfos()) diff --git a/clang/include/clang/AST/TypeProperties.td b/clang/include/clang/AST/TypeProperties.td --- a/clang/include/clang/AST/TypeProperties.td +++ b/clang/include/clang/AST/TypeProperties.td @@ -323,6 +323,9 @@ ? 
node->getExtParameterInfos() : llvm::ArrayRef() }]; } + def : Property<"AArch64SMEAttributes", UInt32> { + let Read = [{ node->getAArch64SMEAttributes() }]; + } def : Creator<[{ auto extInfo = FunctionType::ExtInfo(noReturn, hasRegParm, regParm, @@ -338,6 +341,7 @@ epi.ExceptionSpec = exceptionSpecifier; epi.ExtParameterInfos = extParameterInfo.empty() ? nullptr : extParameterInfo.data(); + epi.AArch64SMEAttributes = AArch64SMEAttributes; return ctx.getFunctionType(returnType, parameters, epi); }]>; } diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -403,6 +403,9 @@ } def TargetARM : TargetArch<["arm", "thumb", "armeb", "thumbeb"]>; def TargetAArch64 : TargetArch<["aarch64"]>; +def TargetAArch64SME : TargetArch<["aarch64"]> { + let CustomCode = [{ Target.hasFeature("sme") }]; +} def TargetAnyArm : TargetArch; def TargetAVR : TargetArch<["avr"]>; def TargetBPF : TargetArch<["bpfel", "bpfeb"]>; @@ -2411,6 +2414,44 @@ let Documentation = [AArch64VectorPcsDocs]; } +def ArmStreamingCompatible : TypeAttr, TargetSpecificAttr { + let Spellings = [GNU<"arm_streaming_compatible">]; + let Subjects = SubjectList<[HasFunctionProto], ErrorDiag>; + let Documentation = [ArmSmeStreamingCompatibleDocs]; +} + +def ArmStreaming : TypeAttr, TargetSpecificAttr { + let Spellings = [GNU<"arm_streaming">]; + let Subjects = SubjectList<[HasFunctionProto], ErrorDiag>; + let Documentation = [ArmSmeStreamingDocs]; +} + +def ArmLocallyStreaming : InheritableAttr, TargetSpecificAttr { + let Spellings = [GNU<"arm_locally_streaming">]; + let Subjects = SubjectList<[Function], ErrorDiag>; + let Documentation = [ArmSmeLocallyStreamingDocs]; +} + +def ArmSharedZA : TypeAttr, TargetSpecificAttr { + let Spellings = [GNU<"arm_shared_za">]; + let Subjects = SubjectList<[HasFunctionProto], ErrorDiag>; + let Documentation = [ArmSmeSharedZADocs]; +} + +def ArmPreservesZA : TypeAttr, TargetSpecificAttr { + let Spellings = [GNU<"arm_preserves_za">]; + let Subjects = SubjectList<[HasFunctionProto], ErrorDiag>; + let Documentation = [ArmSmePreservesZADocs]; +} + +def ArmNewZA : InheritableAttr, TargetSpecificAttr { + let Spellings = [GNU<"arm_new_za">]; + let Subjects = SubjectList<[Function], ErrorDiag>; + let Documentation = [ArmSmeNewZADocs]; +} +def : MutualExclusions<[ArmNewZA, ArmSharedZA]>; +def : MutualExclusions<[ArmNewZA, ArmPreservesZA]>; + def AArch64SVEPcs: DeclOrTypeAttr { let Spellings = [Clang<"aarch64_sve_pcs">]; let Documentation = [AArch64SVEPcsDocs]; diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td --- a/clang/include/clang/Basic/AttrDocs.td +++ b/clang/include/clang/Basic/AttrDocs.td @@ -6484,6 +6484,91 @@ }]; } +def DocCatACLE_SME : DocumentationCategory<"ARM C Language Extensions for SME"> { + let Content = [{ +Clang implements builtins and function attributes as defined in the `Arm C +Language Extensions for SME `_. + +Only when Clang defines __ARM_FEATURE_SME, the support for this feature is +considered to be stable and complete. +}]; +} + +def ArmSmeStreamingDocs : Documentation { + let Category = DocCatACLE_SME; + let Content = [{ +The ``arm_streaming`` attribute is used to mark a function as being a +streaming function, meaning that it is expected to be called with +``PSTATE.SM=1`` and return with ``PSTATE.SM=1``. 
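+
+For example, a minimal sketch of how the attribute might be used (the
+function names below are hypothetical and purely illustrative):
+
+.. code-block:: c++
+
+  // Declares a streaming function: callers are responsible for entering
+  // streaming mode (PSTATE.SM=1) for the duration of the call.
+  void streaming_fn(void) __attribute__((arm_streaming));
+
+  void non_streaming_caller(void) {
+    // When called from a non-streaming function, Clang may emit an
+    // smstart before the call and an smstop after it so that the callee
+    // executes with PSTATE.SM=1.
+    streaming_fn();
+  }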
+ +By adding this attribute, Clang may insert the appropriate ``smstart`` and +``smstop`` instructions before and after the call to guarantee that these +conditions are satisfied. + }]; +} + +def ArmSmeLocallyStreamingDocs : Documentation { + let Category = DocCatACLE_SME; + let Content = [{ +The ``arm_locally_streaming`` attribute is used to mark a function's body +as being executed with ``PSTATE.SM=1``. The function's interface is +non-streaming, meaning that it is expected to be called with +``PSTATE.SM=0`` and return with ``PSTATE.SM=0``. + +By adding this attribute, Clang will insert the appropriate ``smstart`` and +``smstop`` instructions in the prologue and epilogue of the function to ensure +the body is executed with ``PSTATE.SM=1``. + }]; +} + +def ArmSmeStreamingCompatibleDocs : Documentation { + let Category = DocCatACLE_SME; + let Content = [{ +The ``arm_streaming_compatible`` attribute is used to mark a function as +being a streaming compatible function for which ``PSTATE.SM`` can either be +``0`` or ``1`` at runtime. + +Clang will try to avoid generating instructions that are not legal in +streaming-compatible mode. These are instructions that are valid +exclusively when ``PSTATE.SM=0`` or instructions that are valid exclusively +when ``PSTATE.SM=1``. + }]; +} + +def ArmSmeSharedZADocs : Documentation { + let Category = DocCatACLE_SME; + let Content = [{ +The ``arm_shared_za`` attribute is used to mark a function as sharing the +state of ZA, the accumulator array, with that of its callers. + +By adding this attribute, callers of this function will know that the contents +of ZA may be used for passing or returning data, and can be modified. Clang may +assume that ``PSTATE.ZA`` is ``1`` and will avoid setting up a lazy-save +mechanism for calls to functions marked as ``arm_shared_za``. + }]; +} + +def ArmSmeNewZADocs : Documentation { + let Category = DocCatACLE_SME; + let Content = [{ +The ``arm_new_za`` attribute is used to mark a function as a private ZA +function that requires a new state for ZA. + +By adding this attribute, Clang emits the appropriate ``smstart`` instruction to +allow the use of ZA and will additionally commit a lazy-save if the state of ZA +is dormant. It also emits the appropriate ``smstop`` in the function's epilogue. + }]; +} + +def ArmSmePreservesZADocs : Documentation { + let Category = DocCatACLE_SME; + let Content = [{ +When a function is marked as ``arm_preserves_za``, it is a hint to +the compiler that the function and any of it's callees will preserve the state +of ZA. + }]; +} + def ArmMveStrictPolymorphismDocs : Documentation { let Category = DocCatType; let Content = [{ diff --git a/clang/include/clang/Basic/Builtins.def b/clang/include/clang/Basic/Builtins.def --- a/clang/include/clang/Basic/Builtins.def +++ b/clang/include/clang/Basic/Builtins.def @@ -39,6 +39,7 @@ // A -> "reference" to __builtin_va_list // V -> Vector, followed by the number of elements and the base type. // q -> Scalable vector, followed by the number of elements and the base type. +// Q -> AArch64 svcount_t builtin type. // E -> ext_vector, followed by the number of elements and the base type. // X -> _Complex, followed by the base type. 
// Y -> ptrdiff_t diff --git a/clang/include/clang/Basic/BuiltinsAArch64.def b/clang/include/clang/Basic/BuiltinsAArch64.def --- a/clang/include/clang/Basic/BuiltinsAArch64.def +++ b/clang/include/clang/Basic/BuiltinsAArch64.def @@ -66,6 +66,9 @@ TARGET_BUILTIN(__builtin_arm_stg, "vv*", "t", "mte") TARGET_BUILTIN(__builtin_arm_subp, "Uiv*v*", "t", "mte") +// SME state function +BUILTIN(__builtin_arm_get_sme_state, "vULi*ULi*", "n") + // Memory Operations TARGET_BUILTIN(__builtin_arm_mops_memset_tag, "v*v*iz", "", "mte,mops") diff --git a/clang/include/clang/Basic/BuiltinsSVE.def b/clang/include/clang/Basic/BuiltinsSME.def copy from clang/include/clang/Basic/BuiltinsSVE.def copy to clang/include/clang/Basic/BuiltinsSME.def --- a/clang/include/clang/Basic/BuiltinsSVE.def +++ b/clang/include/clang/Basic/BuiltinsSME.def @@ -1,4 +1,4 @@ -//===--- BuiltinsSVE.def - SVE Builtin function database --------*- C++ -*-===// +//===--- BuiltinsSME.def - SVE Builtin function database --------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -6,17 +6,16 @@ // //===----------------------------------------------------------------------===// // -// This file defines the SVE-specific builtin function database. Users of +// This file defines the SME-specific builtin function database. Users of // this file must define the BUILTIN macro to make use of this information. // //===----------------------------------------------------------------------===// // The format of this database matches clang/Basic/Builtins.def. -#define GET_SVE_BUILTINS -#include "clang/Basic/arm_sve_builtins.inc" -#include "clang/Basic/BuiltinsAArch64NeonSVEBridge.def" -#undef GET_SVE_BUILTINS +#define GET_SME_BUILTINS +#include "clang/Basic/arm_sme_builtins.inc" +#undef GET_SME_BUILTINS #undef BUILTIN #undef TARGET_BUILTIN diff --git a/clang/include/clang/Basic/BuiltinsSVE.def b/clang/include/clang/Basic/BuiltinsSVE.def --- a/clang/include/clang/Basic/BuiltinsSVE.def +++ b/clang/include/clang/Basic/BuiltinsSVE.def @@ -19,4 +19,3 @@ #undef GET_SVE_BUILTINS #undef BUILTIN -#undef TARGET_BUILTIN diff --git a/clang/include/clang/Basic/CMakeLists.txt b/clang/include/clang/Basic/CMakeLists.txt --- a/clang/include/clang/Basic/CMakeLists.txt +++ b/clang/include/clang/Basic/CMakeLists.txt @@ -72,6 +72,27 @@ clang_tablegen(arm_sve_sema_rangechecks.inc -gen-arm-sve-sema-rangechecks SOURCE arm_sve.td TARGET ClangARMSveSemaRangeChecks) +clang_tablegen(arm_sve_streaming_attrs.inc -gen-arm-sve-streaming-attrs + SOURCE arm_sve.td + TARGET ClangARMSveStreamingAttrs) +clang_tablegen(arm_sme_builtins.inc -gen-arm-sme-builtins + SOURCE arm_sme.td + TARGET ClangARMSmeBuiltins) +clang_tablegen(arm_sme_builtin_cg.inc -gen-arm-sme-builtin-codegen + SOURCE arm_sme.td + TARGET ClangARMSmeBuiltinCG) +clang_tablegen(arm_sme_typeflags.inc -gen-arm-sme-typeflags + SOURCE arm_sme.td + TARGET ClangARMSmeTypeFlags) +clang_tablegen(arm_sme_sema_rangechecks.inc -gen-arm-sme-sema-rangechecks + SOURCE arm_sme.td + TARGET ClangARMSmeSemaRangeChecks) +clang_tablegen(arm_sme_streaming_attrs.inc -gen-arm-sme-streaming-attrs + SOURCE arm_sme.td + TARGET ClangARMSmeStreamingAttrs) +clang_tablegen(arm_sme_builtins_za_state.inc -gen-arm-sme-builtin-za-state + SOURCE arm_sme.td + TARGET ClangARMSmeBuiltinsZAState) clang_tablegen(arm_cde_builtins.inc -gen-arm-cde-builtin-def SOURCE arm_cde.td TARGET ClangARMCdeBuiltinsDef) diff --git 
a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -2012,6 +2012,10 @@ "than the function it overrides}1,2">; def note_overridden_virtual_function : Note< "overridden virtual function is here">; +def err_conflicting_overriding_attributes : Error< + "virtual function %0 has different attributes " + "%diff{($) than the function it overrides (which has $)|" + "than the function it overrides}1,2">; def err_conflicting_overriding_cc_attributes : Error< "virtual function %0 has different calling convention attributes " "%diff{($) than the function it overrides (which has calling convention $)|" @@ -3056,6 +3060,12 @@ def err_attribute_arm_feature_sve_bits_unsupported : Error< "%0 is only supported when '-msve-vector-bits=' is specified with a " "value of 128, 256, 512, 1024 or 2048.">; +def warn_attribute_arm_sm_incompat_builtin : Warning< + "builtin call has undefined behaviour when called from a %0 function">, + InGroup>; +def warn_attribute_arm_za_builtin_no_za_state : Warning< + "builtin call is not valid when calling from a function without active ZA state">, + InGroup>; def err_sve_vector_in_non_sve_target : Error< "SVE vector type %0 cannot be used in a target without sve">; def err_attribute_riscv_rvv_bits_unsupported : Error< @@ -3574,6 +3584,9 @@ "the vecreturn attribute can only be used on a class or structure with one member, which must be a vector">; def err_attribute_vecreturn_only_pod_record : Error< "the vecreturn attribute can only be used on a POD (plain old data) class or structure (i.e. no virtual functions)">; +def err_sme_attr_mismatch : Error< + "function declared '%0' was previously declared '%1'" + " with different SME function attributes">; def err_cconv_change : Error< "function declared '%0' here was previously declared " "%select{'%2'|without calling convention}1">; diff --git a/clang/include/clang/Basic/TargetBuiltins.h b/clang/include/clang/Basic/TargetBuiltins.h --- a/clang/include/clang/Basic/TargetBuiltins.h +++ b/clang/include/clang/Basic/TargetBuiltins.h @@ -53,6 +53,15 @@ }; } + namespace SME { + enum { + LastSVEBuiltin = SVE::FirstTSBuiltin - 1, +#define BUILTIN(ID, TYPE, ATTRS) BI##ID, +#include "clang/Basic/BuiltinsSME.def" + FirstTSBuiltin, + }; + } // namespace SME + /// AArch64 builtins namespace AArch64 { enum { @@ -60,8 +69,10 @@ LastNEONBuiltin = NEON::FirstTSBuiltin - 1, FirstSVEBuiltin = NEON::FirstTSBuiltin, LastSVEBuiltin = SVE::FirstTSBuiltin - 1, - #define BUILTIN(ID, TYPE, ATTRS) BI##ID, - #include "clang/Basic/BuiltinsAArch64.def" + FirstSMEBuiltin = SVE::FirstTSBuiltin, + LastSMEBuiltin = SME::FirstTSBuiltin - 1, +#define BUILTIN(ID, TYPE, ATTRS) BI##ID, +#include "clang/Basic/BuiltinsAArch64.def" LastTSBuiltin }; } @@ -294,6 +305,14 @@ bool isTupleCreate() const { return Flags & IsTupleCreate; } bool isTupleGet() const { return Flags & IsTupleGet; } bool isTupleSet() const { return Flags & IsTupleSet; } + bool isReadZA() const { return Flags & IsReadZA; } + bool isWriteZA() const { return Flags & IsWriteZA; } + bool isZASliceBaseOffsetIntr() const { + return Flags & IsZASliceBaseOffsetIntr; + } + bool isTileBaseOffsetIntr() const { + return Flags & IsTileBaseOffsetIntr; + } uint64_t getBits() const { return Flags; } bool isFlagSet(uint64_t Flag) const { return Flags & Flag; } diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td new file mode 
100644 --- /dev/null +++ b/clang/include/clang/Basic/arm_sme.td @@ -0,0 +1,617 @@ +//===--- arm_sme.td - ARM SME compiler interface ------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the TableGen definitions from which the ARM SME header +// file will be generated. See: +// +// https://developer.arm.com/architectures/system-architectures/software-standards/acle +// +//===----------------------------------------------------------------------===// + +include "arm_sve_common.td" + +class Inst ft, list ch, MemEltType met = MemEltTyDefault> { + string Name = n; + string Prototype = p; + string Types = t; + string TargetGuard = "sme"; + int Merge = MergeNone.Value; + string MergeSuffix = MergeNone.Suffix; + string LLVMIntrinsic = i; + list Flags = ft; + list ImmChecks = ch; + int MemEltType = met.Value; +} + +// MBaseInst: Memory base instruction. +class MBaseInst f, + MemEltType met, list ch> + : Inst { +} + +// MLInst: Memory load instruction. +class MLInst ch> + : MBaseInst { +} + +// MSInst: Memory store instruction. +class MSInst ch> + : MBaseInst { +} + +multiclass LoadHV ch> { + def NAME # _H : MLInst<"svld1_hor_" # n, p, met, i # "_horiz", ch>; + def NAME # _V : MLInst<"svld1_ver_" # n, p, met, i # "_vert", ch>; +} + +multiclass StoreHV ch> { + def NAME # _H : MSInst<"svst1_hor_" # n, p, met, i # "_horiz", ch>; + def NAME # _V : MSInst<"svst1_ver_" # n, p, met, i # "_vert", ch>; +} + +class ReadInst ch> + : Inst { +} + +class WriteInst ch> + : Inst { +} + +multiclass ReadHV ch> { + def NAME # _H : ReadInst<"svread_hor_" # n, p, t, i # "_horiz", ch>; + def NAME # _V : ReadInst<"svread_ver_" # n, p, t, i # "_vert", ch>; +} + +multiclass WriteHV ch> { + def NAME # _H : WriteInst<"svwrite_hor_" # n, p, t, i # "_horiz", ch>; + def NAME # _V : WriteInst<"svwrite_ver_" # n, p, t, i # "_vert", ch>; +} + +multiclass AddHV ch> { + def NAME # _H : Inst<"svaddha_" # n # "_m", p, t, i # "ha", [IsStreaming, IsSharedZA], ch, MemEltTyDefault>; + def NAME # _V : Inst<"svaddva_" # n # "_m", p, t, i # "va", [IsStreaming, IsSharedZA], ch, MemEltTyDefault>; +} + +multiclass MopAS ch> { + def NAME # _A : Inst<"svmopa_" # n # "_m", "viPPdd", t, "aarch64_sme_" # i # "a" # w, [IsStreaming, IsSharedZA], ch, MemEltTyDefault>; + def NAME # _S : Inst<"svmops_" # n # "_m", "viPPdd", t, "aarch64_sme_" # i # "s" # w, [IsStreaming, IsSharedZA], ch, MemEltTyDefault>; +} + +// == LOADS == + +defm SVLD1_ZA8 : LoadHV<"za8", "vimiPQ", MemEltTyInt8, "aarch64_sme_ld1b", [ImmCheck<0, ImmCheck0>, ImmCheck<2, ImmCheck0_15>]>; +defm SVLD1_ZA16 : LoadHV<"za16", "vimiPQ", MemEltTyInt16, "aarch64_sme_ld1h", [ImmCheck<0, ImmCheck0_1>, ImmCheck<2, ImmCheck0_7>]>; +defm SVLD1_ZA32 : LoadHV<"za32", "vimiPQ", MemEltTyInt32, "aarch64_sme_ld1w", [ImmCheck<0, ImmCheck0_3>, ImmCheck<2, ImmCheck0_3>]>; +defm SVLD1_ZA64 : LoadHV<"za64", "vimiPQ", MemEltTyInt64, "aarch64_sme_ld1d", [ImmCheck<0, ImmCheck0_7>, ImmCheck<2, ImmCheck0_1>]>; +defm SVLD1_ZA128 : LoadHV<"za128", "vimiPQ", MemEltTyInt128, "aarch64_sme_ld1q", [ImmCheck<0, ImmCheck0_15>, ImmCheck<2, ImmCheck0>]>; + +defm SVLD1_VNUM_ZA8 : LoadHV<"vnum_za8", "vimiPQl", MemEltTyInt8, "aarch64_sme_ld1b", [ImmCheck<0, ImmCheck0>, ImmCheck<2, ImmCheck0_15>]>; +defm SVLD1_VNUM_ZA16 : LoadHV<"vnum_za16", 
"vimiPQl", MemEltTyInt16, "aarch64_sme_ld1h", [ImmCheck<0, ImmCheck0_1>, ImmCheck<2, ImmCheck0_7>]>; +defm SVLD1_VNUM_ZA32 : LoadHV<"vnum_za32", "vimiPQl", MemEltTyInt32, "aarch64_sme_ld1w", [ImmCheck<0, ImmCheck0_3>, ImmCheck<2, ImmCheck0_3>]>; +defm SVLD1_VNUM_ZA64 : LoadHV<"vnum_za64", "vimiPQl", MemEltTyInt64, "aarch64_sme_ld1d", [ImmCheck<0, ImmCheck0_7>, ImmCheck<2, ImmCheck0_1>]>; +defm SVLD1_VNUM_ZA128 : LoadHV<"vnum_za128", "vimiPQl", MemEltTyInt128, "aarch64_sme_ld1q", [ImmCheck<0, ImmCheck0_15>, ImmCheck<2, ImmCheck0>]>; + +def SVLDR_VNUM_ZA : MBaseInst<"svldr_vnum_za", "vmiQ", "aarch64_sme_ldr", + [IsLoad, IsStreamingCompatible, IsSharedZA], + MemEltTyInt8, + [ImmCheck<1, ImmCheck0_15>]>; + +// == STORES == + +defm SVST1_ZA8 : StoreHV<"za8", "vimiP{", MemEltTyInt8, "aarch64_sme_st1b", [ImmCheck<0, ImmCheck0>, ImmCheck<2, ImmCheck0_15>]>; +defm SVST1_ZA16 : StoreHV<"za16", "vimiP{", MemEltTyInt16, "aarch64_sme_st1h", [ImmCheck<0, ImmCheck0_1>, ImmCheck<2, ImmCheck0_7>]>; +defm SVST1_ZA32 : StoreHV<"za32", "vimiP{", MemEltTyInt32, "aarch64_sme_st1w", [ImmCheck<0, ImmCheck0_3>, ImmCheck<2, ImmCheck0_3>]>; +defm SVST1_ZA64 : StoreHV<"za64", "vimiP{", MemEltTyInt64, "aarch64_sme_st1d", [ImmCheck<0, ImmCheck0_7>, ImmCheck<2, ImmCheck0_1>]>; +defm SVST1_ZA128 : StoreHV<"za128", "vimiP{", MemEltTyInt128, "aarch64_sme_st1q", [ImmCheck<0, ImmCheck0_15>, ImmCheck<2, ImmCheck0>]>; + +defm SVST1_VNUM_ZA8 : StoreHV<"vnum_za8", "vimiP{l", MemEltTyInt8, "aarch64_sme_st1b", [ImmCheck<0, ImmCheck0>, ImmCheck<2, ImmCheck0_15>]>; +defm SVST1_VNUM_ZA16 : StoreHV<"vnum_za16", "vimiP{l", MemEltTyInt16, "aarch64_sme_st1h", [ImmCheck<0, ImmCheck0_1>, ImmCheck<2, ImmCheck0_7>]>; +defm SVST1_VNUM_ZA32 : StoreHV<"vnum_za32", "vimiP{l", MemEltTyInt32, "aarch64_sme_st1w", [ImmCheck<0, ImmCheck0_3>, ImmCheck<2, ImmCheck0_3>]>; +defm SVST1_VNUM_ZA64 : StoreHV<"vnum_za64", "vimiP{l", MemEltTyInt64, "aarch64_sme_st1d", [ImmCheck<0, ImmCheck0_7>, ImmCheck<2, ImmCheck0_1>]>; +defm SVST1_VNUM_ZA128 : StoreHV<"vnum_za128", "vimiP{l", MemEltTyInt128, "aarch64_sme_st1q", [ImmCheck<0, ImmCheck0_15>, ImmCheck<2, ImmCheck0>]>; + +def SVSTR_VNUM_ZA : MBaseInst<"svstr_vnum_za", "vmi{", "aarch64_sme_str", + [IsStore, IsStreamingCompatible, IsSharedZA, IsPreservedZA], + MemEltTyInt8, + [ImmCheck<1, ImmCheck0_15>]>; + +// == MOVA/READS == + +defm SVREAD_ZA8 : ReadHV<"za8[_{d}]_m", "ddPimi", "cUc", "aarch64_sme_read", [ImmCheck<2, ImmCheck0>, ImmCheck<4, ImmCheck0_15>]>; +defm SVREAD_ZA16 : ReadHV<"za16[_{d}]_m", "ddPimi", "sUshb", "aarch64_sme_read", [ImmCheck<2, ImmCheck0_1>, ImmCheck<4, ImmCheck0_7>]>; +defm SVREAD_ZA32 : ReadHV<"za32[_{d}]_m", "ddPimi", "iUif", "aarch64_sme_read", [ImmCheck<2, ImmCheck0_3>, ImmCheck<4, ImmCheck0_3>]>; +defm SVREAD_ZA64 : ReadHV<"za64[_{d}]_m", "ddPimi", "lUld", "aarch64_sme_read", [ImmCheck<2, ImmCheck0_7>, ImmCheck<4, ImmCheck0_1>]>; +defm SVREAD_ZA128 : ReadHV<"za128[_{d}]_m", "ddPimi", "cUcsUshbiUiflUld", "aarch64_sme_readq", [ImmCheck<2, ImmCheck0_15>, ImmCheck<4, ImmCheck0>]>; + +// == MOVA/WRITES == + +defm SVWRITE_ZA8 : WriteHV<"za8[_{d}]_m", "vimiPd", "cUc", "aarch64_sme_write", [ImmCheck<0, ImmCheck0>, ImmCheck<2, ImmCheck0_15>]>; +defm SVWRITE_ZA16 : WriteHV<"za16[_{d}]_m", "vimiPd", "sUshb", "aarch64_sme_write", [ImmCheck<0, ImmCheck0_1>, ImmCheck<2, ImmCheck0_7>]>; +defm SVWRITE_ZA32 : WriteHV<"za32[_{d}]_m", "vimiPd", "iUif", "aarch64_sme_write", [ImmCheck<0, ImmCheck0_3>, ImmCheck<2, ImmCheck0_3>]>; +defm SVWRITE_ZA64 : WriteHV<"za64[_{d}]_m", "vimiPd", "lUld", 
"aarch64_sme_write", [ImmCheck<0, ImmCheck0_7>, ImmCheck<2, ImmCheck0_1>]>; +defm SVWRITE_ZA128 : WriteHV<"za128[_{d}]_m", "vimiPd", "cUcsUshbiUiflUld", "aarch64_sme_writeq", [ImmCheck<0, ImmCheck0_15>, ImmCheck<2, ImmCheck0>]>; + +// == SVCNT == + +def SVCNTSB : Inst<"svcntsb", "nv", "", "aarch64_sme_cntsb", [IsOverloadNone, IsStreamingCompatible, IsPreservedZA], []>; +def SVCNTSH : Inst<"svcntsh", "nv", "", "aarch64_sme_cntsh", [IsOverloadNone, IsStreamingCompatible, IsPreservedZA], []>; +def SVCNTSW : Inst<"svcntsw", "nv", "", "aarch64_sme_cntsw", [IsOverloadNone, IsStreamingCompatible, IsPreservedZA], []>; +def SVCNTSD : Inst<"svcntsd", "nv", "", "aarch64_sme_cntsd", [IsOverloadNone, IsStreamingCompatible, IsPreservedZA], []>; + +// == ZERO == + +def SVZERO_MASK : Inst<"svzero_mask_za", "vi", "", "aarch64_sme_zero", [IsOverloadNone, IsStreamingCompatible, IsSharedZA], [ImmCheck<0, ImmCheck0_255>]>; + +// == ADDHA/ADDVA == + +defm SVADD_ZA32 : AddHV<"za32[_{d}]", "viPPd", "iUi", "aarch64_sme_add", [ImmCheck<0, ImmCheck0_3>]>; +let TargetGuard = "sme-i16i64" in { + defm SVADD_ZA64 : AddHV<"za64[_{d}]", "viPPd", "lUl", "aarch64_sme_add", [ImmCheck<0, ImmCheck0_7>]>; +} + +// == MOPA / MOPS == + +defm SVMOP_WIDE_ZA32_FP : MopAS<"za32[_{d}]", "hb", "mop", "_wide", [ImmCheck<0, ImmCheck0_3>]>; +defm SVMOP_WIDE_ZA32_S8 : MopAS<"za32[_{d}]", "c", "smop", "_wide", [ImmCheck<0, ImmCheck0_3>]>; +defm SVMOP_WIDE_ZA32_U8 : MopAS<"za32[_{d}]", "Uc", "umop", "_wide", [ImmCheck<0, ImmCheck0_3>]>; +defm SVMOP_ZA32_FP : MopAS<"za32[_{d}]", "f", "mop", "", [ImmCheck<0, ImmCheck0_3>]>; + +let TargetGuard = "sme-i16i64" in { + defm SVMOPA_WIDE_ZA64_S16 : MopAS<"za64[_{d}]", "s", "smop", "_wide", [ImmCheck<0, ImmCheck0_7>]>; + defm SVMOPA_WIDE_ZA64_U16 : MopAS<"za64[_{d}]", "Us", "umop", "_wide", [ImmCheck<0, ImmCheck0_7>]>; +} + +let TargetGuard = "sme-f64f64" in { + defm SVMOP_ZA64_FP : MopAS<"za64[_{d}]", "d", "mop", "", [ImmCheck<0, ImmCheck0_7>]>; +} + +// == SUMOPA / SUMOPS / USMOPS / USMOPA == + +def SVSUMOPA_ZA32_S8 : Inst<"svsumopa_za32[_{d}]_m", "viPPdu", "c", "aarch64_sme_sumopa_wide", [IsStreaming, IsSharedZA], [ImmCheck<0, ImmCheck0_3>], MemEltTyDefault>; +def SVSUMOPS_ZA32_S8 : Inst<"svsumops_za32[_{d}]_m", "viPPdu", "c", "aarch64_sme_sumops_wide", [IsStreaming, IsSharedZA], [ImmCheck<0, ImmCheck0_3>], MemEltTyDefault>; + +let TargetGuard = "sme-i16i64" in { + def SVSUMOPA_ZA64_S16 : Inst<"svsumopa_za64[_{d}]_m", "viPPdu", "s", "aarch64_sme_sumopa_wide", [IsStreaming, IsSharedZA], [ImmCheck<0, ImmCheck0_7>], MemEltTyDefault>; + def SVSUMOPS_ZA64_S16 : Inst<"svsumops_za64[_{d}]_m", "viPPdu", "s", "aarch64_sme_sumops_wide", [IsStreaming, IsSharedZA], [ImmCheck<0, ImmCheck0_7>], MemEltTyDefault>; +} + +def SVUSMOPA_ZA32_U8 : Inst<"svusmopa_za32[_{d}]_m", "viPPdx", "Uc", "aarch64_sme_usmopa_wide", [IsStreaming, IsSharedZA], [ImmCheck<0, ImmCheck0_3>], MemEltTyDefault>; +def SVUSMOPS_ZA32_U8 : Inst<"svusmops_za32[_{d}]_m", "viPPdx", "Uc", "aarch64_sme_usmops_wide", [IsStreaming, IsSharedZA], [ImmCheck<0, ImmCheck0_3>], MemEltTyDefault>; + +let TargetGuard = "sme-i16i64" in { + def SVUSMOPA_ZA64_U16 : Inst<"svusmopa_za64[_{d}]_m", "viPPdx", "Us", "aarch64_sme_usmopa_wide", [IsStreaming, IsSharedZA], [ImmCheck<0, ImmCheck0_7>], MemEltTyDefault>; + def SVUSMOPS_ZA64_U16 : Inst<"svusmops_za64[_{d}]_m", "viPPdx", "Us", "aarch64_sme_usmops_wide", [IsStreaming, IsSharedZA], [ImmCheck<0, ImmCheck0_7>], MemEltTyDefault>; +} + +// FMLA/FMLS +let TargetGuard = "sme2" in { + def SVMLA_MULTI_VG1x2_F32 : 
Inst<"svmla_za32[_{d}]_vg1x2", "vmi22", "f", "aarch64_sme_fmla_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVMLA_MULTI_VG1x4_F32 : Inst<"svmla_za32[_{d}]_vg1x4", "vmi44", "f", "aarch64_sme_fmla_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVMLS_MULTI_VG1x2_F32 : Inst<"svmls_za32[_{d}]_vg1x2", "vmi22", "f", "aarch64_sme_fmls_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVMLS_MULTI_VG1x4_F32 : Inst<"svmls_za32[_{d}]_vg1x4", "vmi44", "f", "aarch64_sme_fmls_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + + def SVMLA_SINGLE_VG1x2_F32 : Inst<"svmla[_single]_za32[_{d}]_vg1x2", "vmi2d", "f", "aarch64_sme_fmla_single_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVMLA_SINGLE_VG1x4_F32 : Inst<"svmla[_single]_za32[_{d}]_vg1x4", "vmi4d", "f", "aarch64_sme_fmla_single_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVMLS_SINGLE_VG1x2_F32 : Inst<"svmls[_single]_za32[_{d}]_vg1x2", "vmi2d", "f", "aarch64_sme_fmls_single_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVMLS_SINGLE_VG1x4_F32 : Inst<"svmls[_single]_za32[_{d}]_vg1x4", "vmi4d", "f", "aarch64_sme_fmls_single_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + + def SVMLA_LANE_VG1x2_F32 : Inst<"svmla_lane_za32[_{d}]_vg1x2", "vmi2di", "f", "aarch64_sme_fmla_lane_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>, ImmCheck<4, ImmCheck0_3>]>; + def SVMLA_LANE_VG1x4_F32 : Inst<"svmla_lane_za32[_{d}]_vg1x4", "vmi4di", "f", "aarch64_sme_fmla_lane_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>, ImmCheck<4, ImmCheck0_3>]>; + def SVMLS_LANE_VG1x2_F32 : Inst<"svmls_lane_za32[_{d}]_vg1x2", "vmi2di", "f", "aarch64_sme_fmls_lane_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>, ImmCheck<4, ImmCheck0_3>]>; + def SVMLS_LANE_VG1x4_F32 : Inst<"svmls_lane_za32[_{d}]_vg1x4", "vmi4di", "f", "aarch64_sme_fmls_lane_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>, ImmCheck<4, ImmCheck0_3>]>; +} + +let TargetGuard = "sme2,sme-f64f64" in { + def SVMLA_MULTI_VG1x2_F64 : Inst<"svmla_za64[_{d}]_vg1x2", "vmi22", "d", "aarch64_sme_fmla_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVMLA_MULTI_VG1x4_F64 : Inst<"svmla_za64[_{d}]_vg1x4", "vmi44", "d", "aarch64_sme_fmla_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVMLS_MULTI_VG1x2_F64 : Inst<"svmls_za64[_{d}]_vg1x2", "vmi22", "d", "aarch64_sme_fmls_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVMLS_MULTI_VG1x4_F64 : Inst<"svmls_za64[_{d}]_vg1x4", "vmi44", "d", "aarch64_sme_fmls_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + + def SVMLA_SINGLE_VG1x2_F64 : Inst<"svmla[_single]_za64[_{d}]_vg1x2", "vmi2d", "d", "aarch64_sme_fmla_single_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVMLA_SINGLE_VG1x4_F64 : Inst<"svmla[_single]_za64[_{d}]_vg1x4", "vmi4d", "d", "aarch64_sme_fmla_single_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def 
SVMLS_SINGLE_VG1x2_F64 : Inst<"svmls[_single]_za64[_{d}]_vg1x2", "vmi2d", "d", "aarch64_sme_fmls_single_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVMLS_SINGLE_VG1x4_F64 : Inst<"svmls[_single]_za64[_{d}]_vg1x4", "vmi4d", "d", "aarch64_sme_fmls_single_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + + def SVMLA_LANE_VG1x2_F64 : Inst<"svmla_lane_za64[_{d}]_vg1x2", "vmi2di", "d", "aarch64_sme_fmla_lane_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>, ImmCheck<4, ImmCheck0_1>]>; + def SVMLA_LANE_VG1x4_F64 : Inst<"svmla_lane_za64[_{d}]_vg1x4", "vmi4di", "d", "aarch64_sme_fmla_lane_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>, ImmCheck<4, ImmCheck0_1>]>; + def SVMLS_LANE_VG1x2_F64 : Inst<"svmls_lane_za64[_{d}]_vg1x2", "vmi2di", "d", "aarch64_sme_fmls_lane_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>, ImmCheck<4, ImmCheck0_1>]>; + def SVMLS_LANE_VG1x4_F64 : Inst<"svmls_lane_za64[_{d}]_vg1x4", "vmi4di", "d", "aarch64_sme_fmls_lane_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>, ImmCheck<4, ImmCheck0_1>]>; +} + +// FMLAL/FMLSL/UMLAL/SMLAL +// SMLALL/UMLALL/USMLALL/SUMLALL +let TargetGuard = "sme2" in { + // MULTI MLAL + def SVMLAL_MULTI_VG2x2_F16 : Inst<"svmla_za32[_{d}]_vg2x2", "vmi22", "bh", "aarch64_sme_fmlal_vg2x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>]>; + def SVMLAL_MULTI_VG2x4_F16 : Inst<"svmla_za32[_{d}]_vg2x4", "vmi44", "bh", "aarch64_sme_fmlal_vg2x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>]>; + def SVMLAL_MULTI_VG2x2_S16 : Inst<"svmla_za32[_{d}]_vg2x2", "vmi22", "s", "aarch64_sme_smlal_vg2x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>]>; + def SVMLAL_MULTI_VG2x4_S16 : Inst<"svmla_za32[_{d}]_vg2x4", "vmi44", "s", "aarch64_sme_smlal_vg2x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>]>; + def SVMLAL_MULTI_VG2x2_U16 : Inst<"svmla_za32[_{d}]_vg2x2", "vmi22", "Us", "aarch64_sme_umlal_vg2x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>]>; + def SVMLAL_MULTI_VG2x4_U16 : Inst<"svmla_za32[_{d}]_vg2x4", "vmi44", "Us", "aarch64_sme_umlal_vg2x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>]>; + + def SVMLAL_MULTI_VG4x2_S8 : Inst<"svmla_za32[_{d}]_vg4x2", "vmi22", "c", "aarch64_sme_smla_za32_vg4x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>]>; + def SVMLAL_MULTI_VG4x2_U8 : Inst<"svmla_za32[_{d}]_vg4x2", "vmi22", "Uc", "aarch64_sme_umla_za32_vg4x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>]>; + def SVMLAL_MULTI_VG4x4_S8 : Inst<"svmla_za32[_{d}]_vg4x4", "vmi44", "c", "aarch64_sme_smla_za32_vg4x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>]>; + def SVMLAL_MULTI_VG4x4_U8 : Inst<"svmla_za32[_{d}]_vg4x4", "vmi44", "Uc", "aarch64_sme_umla_za32_vg4x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>]>; + + let TargetGuard = "sme-i16i64" in { + def SVMLAL_MULTI_VG4x2_S16 : Inst<"svmla_za64[_{d}]_vg4x2", "vmi22", "s", "aarch64_sme_smla_za64_vg4x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>]>; + def SVMLAL_MULTI_VG4x2_U16 : 
Inst<"svmla_za64[_{d}]_vg4x2", "vmi22", "Us", "aarch64_sme_umla_za64_vg4x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>]>; + def SVMLAL_MULTI_VG4x4_S16 : Inst<"svmla_za64[_{d}]_vg4x4", "vmi44", "s", "aarch64_sme_smla_za64_vg4x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>]>; + def SVMLAL_MULTI_VG4x4_U16 : Inst<"svmla_za64[_{d}]_vg4x4", "vmi44", "Us", "aarch64_sme_umla_za64_vg4x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>]>; + } + + // MULTI MLSL + def SVMLSL_MULTI_VG2x2_F16 : Inst<"svmls_za32[_{d}]_vg2x2", "vmi22", "bh", "aarch64_sme_fmlsl_vg2x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>]>; + def SVMLSL_MULTI_VG2x4_F16 : Inst<"svmls_za32[_{d}]_vg2x4", "vmi44", "bh", "aarch64_sme_fmlsl_vg2x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>]>; + def SVMLSL_MULTI_VG2x2_S16 : Inst<"svmls_za32[_{d}]_vg2x2", "vmi22", "s", "aarch64_sme_smlsl_vg2x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>]>; + def SVMLSL_MULTI_VG2x4_S16 : Inst<"svmls_za32[_{d}]_vg2x4", "vmi44", "s", "aarch64_sme_smlsl_vg2x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>]>; + def SVMLSL_MULTI_VG2x2_U16 : Inst<"svmls_za32[_{d}]_vg2x2", "vmi22", "Us", "aarch64_sme_umlsl_vg2x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>]>; + def SVMLSL_MULTI_VG2x4_U16 : Inst<"svmls_za32[_{d}]_vg2x4", "vmi44", "Us", "aarch64_sme_umlsl_vg2x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>]>; + + def SVMLSL_MULTI_VG4x2_S8 : Inst<"svmls_za32[_{d}]_vg4x2", "vmi22", "c", "aarch64_sme_smls_za32_vg4x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>]>; + def SVMLSL_MULTI_VG4x2_U8 : Inst<"svmls_za32[_{d}]_vg4x2", "vmi22", "Uc", "aarch64_sme_umls_za32_vg4x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>]>; + def SVMLSL_MULTI_VG4x4_S8 : Inst<"svmls_za32[_{d}]_vg4x4", "vmi44", "c", "aarch64_sme_smls_za32_vg4x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>]>; + def SVMLSL_MULTI_VG4x4_U8 : Inst<"svmls_za32[_{d}]_vg4x4", "vmi44", "Uc", "aarch64_sme_umls_za32_vg4x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>]>; + +let TargetGuard = "sme-i16i64" in { + def SVMLSL_MULTI_VG4x2_S16 : Inst<"svmls_za64[_{d}]_vg4x2", "vmi22", "s", "aarch64_sme_smls_za64_vg4x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>]>; + def SVMLSL_MULTI_VG4x2_U16 : Inst<"svmls_za64[_{d}]_vg4x2", "vmi22", "Us", "aarch64_sme_umls_za64_vg4x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>]>; + def SVMLSL_MULTI_VG4x4_S16 : Inst<"svmls_za64[_{d}]_vg4x4", "vmi44", "s", "aarch64_sme_smls_za64_vg4x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>]>; + def SVMLSL_MULTI_VG4x4_U16 : Inst<"svmls_za64[_{d}]_vg4x4", "vmi44", "Us", "aarch64_sme_umls_za64_vg4x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>]>; + } + + // SINGLE MLAL + def SVMLAL_SINGLE_VG2x1_F16 : Inst<"svmla[_single]_za32[_{d}]_vg2x1", "vmidd", "bh", "aarch64_sme_fmlal_single_vg2x1", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_14_Mul2>]>; + def 
SVMLAL_SINGLE_VG2x2_F16 : Inst<"svmla[_single]_za32[_{d}]_vg2x2", "vmi2d", "bh", "aarch64_sme_fmlal_single_vg2x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>]>; + def SVMLAL_SINGLE_VG2x4_F16 : Inst<"svmla[_single]_za32[_{d}]_vg2x4", "vmi4d", "bh", "aarch64_sme_fmlal_single_vg2x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>]>; + def SVMLAL_SINGLE_VG2x1_S16 : Inst<"svmla[_single]_za32[_{d}]_vg2x1", "vmidd", "s", "aarch64_sme_smlal_single_vg2x1", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_14_Mul2>]>; + def SVMLAL_SINGLE_VG2x2_S16 : Inst<"svmla[_single]_za32[_{d}]_vg2x2", "vmi2d", "s", "aarch64_sme_smlal_single_vg2x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>]>; + def SVMLAL_SINGLE_VG2x4_S16 : Inst<"svmla[_single]_za32[_{d}]_vg2x4", "vmi4d", "s", "aarch64_sme_smlal_single_vg2x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>]>; + def SVMLAL_SINGLE_VG2x1_U16 : Inst<"svmla[_single]_za32[_{d}]_vg2x1", "vmidd", "Us", "aarch64_sme_umlal_single_vg2x1", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_14_Mul2>]>; + def SVMLAL_SINGLE_VG2x2_U16 : Inst<"svmla[_single]_za32[_{d}]_vg2x2", "vmi2d", "Us", "aarch64_sme_umlal_single_vg2x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>]>; + def SVMLAL_SINGLE_VG2x4_U16 : Inst<"svmla[_single]_za32[_{d}]_vg2x4", "vmi4d", "Us", "aarch64_sme_umlal_single_vg2x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>]>; + + def SVMLAL_SINGLE_VG4x1_S8 : Inst<"svmla[_single]_za32[_{d}]_vg4x1", "vmidd", "c", "aarch64_sme_smla_za32_single_vg4x1", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_12_Mul4>]>; + def SVMLAL_SINGLE_VG4x1_U8 : Inst<"svmla[_single]_za32[_{d}]_vg4x1", "vmidd", "Uc", "aarch64_sme_umla_za32_single_vg4x1", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_12_Mul4>]>; + def SVMLAL_SINGLE_VG4x2_S8 : Inst<"svmla[_single]_za32[_{d}]_vg4x2", "vmi2d", "c", "aarch64_sme_smla_za32_single_vg4x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>]>; + def SVMLAL_SINGLE_VG4x2_U8 : Inst<"svmla[_single]_za32[_{d}]_vg4x2", "vmi2d", "Uc", "aarch64_sme_umla_za32_single_vg4x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>]>; + def SVMLAL_SINGLE_VG4x4_S8 : Inst<"svmla[_single]_za32[_{d}]_vg4x4", "vmi4d", "c", "aarch64_sme_smla_za32_single_vg4x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>]>; + def SVMLAL_SINGLE_VG4x4_U8 : Inst<"svmla[_single]_za32[_{d}]_vg4x4", "vmi4d", "Uc", "aarch64_sme_umla_za32_single_vg4x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>]>; + + let TargetGuard = "sme-i16i64" in { + def SVMLAL_SINGLE_VG4x1_S16 : Inst<"svmla[_single]_za64[_{d}]_vg4x1", "vmidd", "s", "aarch64_sme_smla_za64_single_vg4x1", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_12_Mul4>]>; + def SVMLAL_SINGLE_VG4x1_U16 : Inst<"svmla[_single]_za64[_{d}]_vg4x1", "vmidd", "Us", "aarch64_sme_umla_za64_single_vg4x1", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_12_Mul4>]>; + def SVMLAL_SINGLE_VG4x2_S16 : Inst<"svmla[_single]_za64[_{d}]_vg4x2", "vmi2d", "s", "aarch64_sme_smla_za64_single_vg4x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], 
[ImmCheck<1, ImmCheck0_4_Mul4>]>; + def SVMLAL_SINGLE_VG4x2_U16 : Inst<"svmla[_single]_za64[_{d}]_vg4x2", "vmi2d", "Us", "aarch64_sme_umla_za64_single_vg4x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>]>; + def SVMLAL_SINGLE_VG4x4_S16 : Inst<"svmla[_single]_za64[_{d}]_vg4x4", "vmi4d", "s", "aarch64_sme_smla_za64_single_vg4x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>]>; + def SVMLAL_SINGLE_VG4x4_U16 : Inst<"svmla[_single]_za64[_{d}]_vg4x4", "vmi4d", "Us", "aarch64_sme_umla_za64_single_vg4x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>]>; + } + + // SINGLE MLSL + def SVMLSL_SINGLE_VG2x1_F16 : Inst<"svmls[_single]_za32[_{d}]_vg2x1", "vmidd", "bh", "aarch64_sme_fmlsl_single_vg2x1", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_14_Mul2>]>; + def SVMLSL_SINGLE_VG2x2_F16 : Inst<"svmls[_single]_za32[_{d}]_vg2x2", "vmi2d", "bh", "aarch64_sme_fmlsl_single_vg2x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>]>; + def SVMLSL_SINGLE_VG2x4_F16 : Inst<"svmls[_single]_za32[_{d}]_vg2x4", "vmi4d", "bh", "aarch64_sme_fmlsl_single_vg2x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>]>; + def SVMLSL_SINGLE_VG2x1_S16 : Inst<"svmls[_single]_za32[_{d}]_vg2x1", "vmidd", "s", "aarch64_sme_smlsl_single_vg2x1", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_14_Mul2>]>; + def SVMLSL_SINGLE_VG2x2_S16 : Inst<"svmls[_single]_za32[_{d}]_vg2x2", "vmi2d", "s", "aarch64_sme_smlsl_single_vg2x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>]>; + def SVMLSL_SINGLE_VG2x4_S16 : Inst<"svmls[_single]_za32[_{d}]_vg2x4", "vmi4d", "s", "aarch64_sme_smlsl_single_vg2x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>]>; + def SVMLSL_SINGLE_VG2x1_U16 : Inst<"svmls[_single]_za32[_{d}]_vg2x1", "vmidd", "Us", "aarch64_sme_umlsl_single_vg2x1", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_14_Mul2>]>; + def SVMLSL_SINGLE_VG2x2_U16 : Inst<"svmls[_single]_za32[_{d}]_vg2x2", "vmi2d", "Us", "aarch64_sme_umlsl_single_vg2x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>]>; + def SVMLSL_SINGLE_VG2x4_U16 : Inst<"svmls[_single]_za32[_{d}]_vg2x4", "vmi4d", "Us", "aarch64_sme_umlsl_single_vg2x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>]>; + + def SVMLSL_SINGLE_VG4x1_S8 : Inst<"svmls[_single]_za32[_{d}]_vg4x1", "vmidd", "c", "aarch64_sme_smls_za32_single_vg4x1", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_12_Mul4>]>; + def SVMLSL_SINGLE_VG4x1_U8 : Inst<"svmls[_single]_za32[_{d}]_vg4x1", "vmidd", "Uc", "aarch64_sme_umls_za32_single_vg4x1", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_12_Mul4>]>; + def SVMLSL_SINGLE_VG4x2_S8 : Inst<"svmls[_single]_za32[_{d}]_vg4x2", "vmi2d", "c", "aarch64_sme_smls_za32_single_vg4x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>]>; + def SVMLSL_SINGLE_VG4x2_U8 : Inst<"svmls[_single]_za32[_{d}]_vg4x2", "vmi2d", "Uc", "aarch64_sme_umls_za32_single_vg4x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>]>; + def SVMLSL_SINGLE_VG4x4_S8 : Inst<"svmls[_single]_za32[_{d}]_vg4x4", "vmi4d", "c", "aarch64_sme_smls_za32_single_vg4x4", [IsStreaming, IsSharedZA, 
IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>]>; + def SVMLSL_SINGLE_VG4x4_U8 : Inst<"svmls[_single]_za32[_{d}]_vg4x4", "vmi4d", "Uc", "aarch64_sme_umls_za32_single_vg4x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>]>; + + let TargetGuard = "sme-i16i64" in { + def SVMLSL_SINGLE_VG4x1_S16 : Inst<"svmls[_single]_za64[_{d}]_vg4x1", "vmidd", "s", "aarch64_sme_smls_za64_single_vg4x1", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_12_Mul4>]>; + def SVMLSL_SINGLE_VG4x1_U16 : Inst<"svmls[_single]_za64[_{d}]_vg4x1", "vmidd", "Us", "aarch64_sme_umls_za64_single_vg4x1", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_12_Mul4>]>; + def SVMLSL_SINGLE_VG4x2_S16 : Inst<"svmls[_single]_za64[_{d}]_vg4x2", "vmi2d", "s", "aarch64_sme_smls_za64_single_vg4x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>]>; + def SVMLSL_SINGLE_VG4x2_U16 : Inst<"svmls[_single]_za64[_{d}]_vg4x2", "vmi2d", "Us", "aarch64_sme_umls_za64_single_vg4x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>]>; + def SVMLSL_SINGLE_VG4x4_S16 : Inst<"svmls[_single]_za64[_{d}]_vg4x4", "vmi4d", "s", "aarch64_sme_smls_za64_single_vg4x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>]>; + def SVMLSL_SINGLE_VG4x4_U16 : Inst<"svmls[_single]_za64[_{d}]_vg4x4", "vmi4d", "Us", "aarch64_sme_umls_za64_single_vg4x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>]>; + } + + // INDEXED MLAL + def SVMLAL_LANE_VG2x1_F16 : Inst<"svmla_lane_za32[_{d}]_vg2x1", "vmiddi", "bh", "aarch64_sme_fmlal_lane_vg2x1", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_14_Mul2>, ImmCheck<4, ImmCheck0_7>]>; + def SVMLAL_LANE_VG2x2_F16 : Inst<"svmla_lane_za32[_{d}]_vg2x2", "vmi2di", "bh", "aarch64_sme_fmlal_lane_vg2x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>, ImmCheck<4, ImmCheck0_7>]>; + def SVMLAL_LANE_VG2x4_F16 : Inst<"svmla_lane_za32[_{d}]_vg2x4", "vmi4di", "bh", "aarch64_sme_fmlal_lane_vg2x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>, ImmCheck<4, ImmCheck0_7>]>; + def SVMLAL_LANE_VG2x1_S16 : Inst<"svmla_lane_za32[_{d}]_vg2x1", "vmiddi", "s", "aarch64_sme_smlal_lane_vg2x1", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_14_Mul2>, ImmCheck<4, ImmCheck0_7>]>; + def SVMLAL_LANE_VG2x2_S16 : Inst<"svmla_lane_za32[_{d}]_vg2x2", "vmi2di", "s", "aarch64_sme_smlal_lane_vg2x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>, ImmCheck<4, ImmCheck0_7>]>; + def SVMLAL_LANE_VG2x4_S16 : Inst<"svmla_lane_za32[_{d}]_vg2x4", "vmi4di", "s", "aarch64_sme_smlal_lane_vg2x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>, ImmCheck<4, ImmCheck0_7>]>; + def SVMLAL_LANE_VG2x1_U16 : Inst<"svmla_lane_za32[_{d}]_vg2x1", "vmiddi", "Us", "aarch64_sme_umlal_lane_vg2x1", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_14_Mul2>, ImmCheck<4, ImmCheck0_7>]>; + def SVMLAL_LANE_VG2x2_U16 : Inst<"svmla_lane_za32[_{d}]_vg2x2", "vmi2di", "Us", "aarch64_sme_umlal_lane_vg2x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>, ImmCheck<4, ImmCheck0_7>]>; + def SVMLAL_LANE_VG2x4_U16 : Inst<"svmla_lane_za32[_{d}]_vg2x4", "vmi4di", "Us", "aarch64_sme_umlal_lane_vg2x4", [IsStreaming, IsSharedZA, 
IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>, ImmCheck<4, ImmCheck0_7>]>; + + def SVMLAL_LANE_VG4x1_S8 : Inst<"svmla_lane_za32[_{d}]_vg4x1", "vmiddi", "c", "aarch64_sme_smla_za32_lane_vg4x1", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_12_Mul4>, ImmCheck<4, ImmCheck0_15>]>; + def SVMLAL_LANE_VG4x1_U8 : Inst<"svmla_lane_za32[_{d}]_vg4x1", "vmiddi", "Uc", "aarch64_sme_umla_za32_lane_vg4x1", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_12_Mul4>, ImmCheck<4, ImmCheck0_15>]>; + def SVMLAL_LANE_VG4x2_S8 : Inst<"svmla_lane_za32[_{d}]_vg4x2", "vmi2di", "c", "aarch64_sme_smla_za32_lane_vg4x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>, ImmCheck<4, ImmCheck0_15>]>; + def SVMLAL_LANE_VG4x2_U8 : Inst<"svmla_lane_za32[_{d}]_vg4x2", "vmi2di", "Uc", "aarch64_sme_umla_za32_lane_vg4x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>, ImmCheck<4, ImmCheck0_15>]>; + def SVMLAL_LANE_VG4x4_S8 : Inst<"svmla_lane_za32[_{d}]_vg4x4", "vmi4di", "c", "aarch64_sme_smla_za32_lane_vg4x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>, ImmCheck<4, ImmCheck0_15>]>; + def SVMLAL_LANE_VG4x4_U8 : Inst<"svmla_lane_za32[_{d}]_vg4x4", "vmi4di", "Uc", "aarch64_sme_umla_za32_lane_vg4x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>, ImmCheck<4, ImmCheck0_15>]>; + + let TargetGuard = "sme-i16i64" in { + def SVMLAL_LANE_VG4x1_S16 : Inst<"svmla_lane_za64[_{d}]_vg4x1", "vmiddi", "s", "aarch64_sme_smla_za64_lane_vg4x1", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_12_Mul4>, ImmCheck<4, ImmCheck0_7>]>; + def SVMLAL_LANE_VG4x1_U16 : Inst<"svmla_lane_za64[_{d}]_vg4x1", "vmiddi", "Us", "aarch64_sme_umla_za64_lane_vg4x1", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_12_Mul4>, ImmCheck<4, ImmCheck0_7>]>; + def SVMLAL_LANE_VG4x2_S16 : Inst<"svmla_lane_za64[_{d}]_vg4x2", "vmi2di", "s", "aarch64_sme_smla_za64_lane_vg4x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>, ImmCheck<4, ImmCheck0_7>]>; + def SVMLAL_LANE_VG4x2_U16 : Inst<"svmla_lane_za64[_{d}]_vg4x2", "vmi2di", "Us", "aarch64_sme_umla_za64_lane_vg4x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>, ImmCheck<4, ImmCheck0_7>]>; + def SVMLAL_LANE_VG4x4_S16 : Inst<"svmla_lane_za64[_{d}]_vg4x4", "vmi4di", "s", "aarch64_sme_smla_za64_lane_vg4x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>, ImmCheck<4, ImmCheck0_7>]>; + def SVMLAL_LANE_VG4x4_U16 : Inst<"svmla_lane_za64[_{d}]_vg4x4", "vmi4di", "Us", "aarch64_sme_umla_za64_lane_vg4x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>, ImmCheck<4, ImmCheck0_7>]>; + } + + // INDEXED MLSL + def SVMLSL_LANE_VG2x1_F16 : Inst<"svmls_lane_za32[_{d}]_vg2x1", "vmiddi", "bh", "aarch64_sme_fmlsl_lane_vg2x1", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_14_Mul2>, ImmCheck<4, ImmCheck0_7>]>; + def SVMLSL_LANE_VG2x2_F16 : Inst<"svmls_lane_za32[_{d}]_vg2x2", "vmi2di", "bh", "aarch64_sme_fmlsl_lane_vg2x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>, ImmCheck<4, ImmCheck0_7>]>; + def SVMLSL_LANE_VG2x4_F16 : Inst<"svmls_lane_za32[_{d}]_vg2x4", "vmi4di", "bh", "aarch64_sme_fmlsl_lane_vg2x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, 
ImmCheck0_6_Mul2>, ImmCheck<4, ImmCheck0_7>]>; + def SVMLSL_LANE_VG2x1_S16 : Inst<"svmls_lane_za32[_{d}]_vg2x1", "vmiddi", "s", "aarch64_sme_smlsl_lane_vg2x1", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_14_Mul2>, ImmCheck<4, ImmCheck0_7>]>; + def SVMLSL_LANE_VG2x2_S16 : Inst<"svmls_lane_za32[_{d}]_vg2x2", "vmi2di", "s", "aarch64_sme_smlsl_lane_vg2x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>, ImmCheck<4, ImmCheck0_7>]>; + def SVMLSL_LANE_VG2x4_S16 : Inst<"svmls_lane_za32[_{d}]_vg2x4", "vmi4di", "s", "aarch64_sme_smlsl_lane_vg2x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>, ImmCheck<4, ImmCheck0_7>]>; + def SVMLSL_LANE_VG2x1_U16 : Inst<"svmls_lane_za32[_{d}]_vg2x1", "vmiddi", "Us", "aarch64_sme_umlsl_lane_vg2x1", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_14_Mul2>, ImmCheck<4, ImmCheck0_7>]>; + def SVMLSL_LANE_VG2x2_U16 : Inst<"svmls_lane_za32[_{d}]_vg2x2", "vmi2di", "Us", "aarch64_sme_umlsl_lane_vg2x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>, ImmCheck<4, ImmCheck0_7>]>; + def SVMLSL_LANE_VG2x4_U16 : Inst<"svmls_lane_za32[_{d}]_vg2x4", "vmi4di", "Us", "aarch64_sme_umlsl_lane_vg2x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>, ImmCheck<4, ImmCheck0_7>]>; + + def SVMLSL_LANE_VG4x1_S8 : Inst<"svmls_lane_za32[_{d}]_vg4x1", "vmiddi", "c", "aarch64_sme_smls_za32_lane_vg4x1", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_12_Mul4>, ImmCheck<4, ImmCheck0_15>]>; + def SVMLSL_LANE_VG4x1_U8 : Inst<"svmls_lane_za32[_{d}]_vg4x1", "vmiddi", "Uc", "aarch64_sme_umls_za32_lane_vg4x1", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_12_Mul4>, ImmCheck<4, ImmCheck0_15>]>; + def SVMLSL_LANE_VG4x2_S8 : Inst<"svmls_lane_za32[_{d}]_vg4x2", "vmi2di", "c", "aarch64_sme_smls_za32_lane_vg4x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>, ImmCheck<4, ImmCheck0_15>]>; + def SVMLSL_LANE_VG4x2_U8 : Inst<"svmls_lane_za32[_{d}]_vg4x2", "vmi2di", "Uc", "aarch64_sme_umls_za32_lane_vg4x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>, ImmCheck<4, ImmCheck0_15>]>; + def SVMLSL_LANE_VG4x4_S8 : Inst<"svmls_lane_za32[_{d}]_vg4x4", "vmi4di", "c", "aarch64_sme_smls_za32_lane_vg4x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>, ImmCheck<4, ImmCheck0_15>]>; + def SVMLSL_LANE_VG4x4_U8 : Inst<"svmls_lane_za32[_{d}]_vg4x4", "vmi4di", "Uc", "aarch64_sme_umls_za32_lane_vg4x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>, ImmCheck<4, ImmCheck0_15>]>; + + let TargetGuard = "sme-i16i64" in { + def SVMLSL_LANE_VG4x1_S16 : Inst<"svmls_lane_za64[_{d}]_vg4x1", "vmiddi", "s", "aarch64_sme_smls_za64_lane_vg4x1", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_12_Mul4>, ImmCheck<4, ImmCheck0_7>]>; + def SVMLSL_LANE_VG4x1_U16 : Inst<"svmls_lane_za64[_{d}]_vg4x1", "vmiddi", "Us", "aarch64_sme_umls_za64_lane_vg4x1", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_12_Mul4>, ImmCheck<4, ImmCheck0_7>]>; + def SVMLSL_LANE_VG4x2_S16 : Inst<"svmls_lane_za64[_{d}]_vg4x2", "vmi2di", "s", "aarch64_sme_smls_za64_lane_vg4x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>, ImmCheck<4, ImmCheck0_7>]>; + def SVMLSL_LANE_VG4x2_U16 : 
Inst<"svmls_lane_za64[_{d}]_vg4x2", "vmi2di", "Us", "aarch64_sme_umls_za64_lane_vg4x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>, ImmCheck<4, ImmCheck0_7>]>; + def SVMLSL_LANE_VG4x4_S16 : Inst<"svmls_lane_za64[_{d}]_vg4x4", "vmi4di", "s", "aarch64_sme_smls_za64_lane_vg4x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>, ImmCheck<4, ImmCheck0_7>]>; + def SVMLSL_LANE_VG4x4_U16 : Inst<"svmls_lane_za64[_{d}]_vg4x4", "vmi4di", "Us", "aarch64_sme_umls_za64_lane_vg4x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>, ImmCheck<4, ImmCheck0_7>]>; + } + + // SINGLE SUMLALL + def SVSUMLALL_SINGLE_VG4x2 : Inst<"svsumla[_single]_za32[_u8]_vg4x2", "vmi2.xd", "Uc", "aarch64_sme_sumla_za32_single_vg4x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>]>; + def SVSUMLALL_SINGLE_VG4x4 : Inst<"svsumla[_single]_za32[_u8]_vg4x4", "vmi4.xd", "Uc", "aarch64_sme_sumla_za32_single_vg4x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>]>; + + // INDEXED SUMLALL + def SVSUMLALL_LANE_VG4x1 : Inst<"svsumla_lane_za32[_{d}]_vg4x1", "vmidui", "c", "aarch64_sme_sumla_za32_lane_vg4x1", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_12_Mul4>, ImmCheck<4, ImmCheck0_15>]>; + def SVSUMLALL_LANE_VG4x2 : Inst<"svsumla_lane_za32[_{d}]_vg4x2", "vmi2ui", "c", "aarch64_sme_sumla_za32_lane_vg4x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>, ImmCheck<4, ImmCheck0_15>]>; + def SVSUMLALL_LANE_VG4x4 : Inst<"svsumla_lane_za32[_{d}]_vg4x4", "vmi4ui", "c", "aarch64_sme_sumla_za32_lane_vg4x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>, ImmCheck<4, ImmCheck0_15>]>; + + // SINGLE USMLALL + def SVUSMLALL_SINGLE_VG4x1 : Inst<"svusmla[_single]_za32[_{d}]_vg4x1", "vmiud", "c", "aarch64_sme_usmla_za32_single_vg4x1", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_12_Mul4>]>; + def SVUSMLALL_SINGLE_VG4x2 : Inst<"svusmla[_single]_za32[_{d}]_vg4x2", "vmi2.ud", "c", "aarch64_sme_usmla_za32_single_vg4x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>]>; + def SVUSMLALL_SINGLE_VG4x4 : Inst<"svusmla[_single]_za32[_{d}]_vg4x4", "vmi4.ud", "c", "aarch64_sme_usmla_za32_single_vg4x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>]>; + + // MULTI USMLALL + def SVUSMLALL_MULTI_VG4x2 : Inst<"svusmla_za32[_{d}]_vg4x2", "vmi2.u2", "c", "aarch64_sme_usmla_za32_vg4x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>]>; + def SVUSMLALL_MULTI_VG4x4 : Inst<"svusmla_za32[_{d}]_vg4x4", "vmi4.u4", "c", "aarch64_sme_usmla_za32_vg4x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>]>; + + // INDEXED USMLALL + def SVUSMLALL_LANE_VG4x1 : Inst<"svusmla_lane_za32[_{d}]_vg4x1", "vmidxi", "Uc", "aarch64_sme_usmla_za32_lane_vg4x1", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_12_Mul4>, ImmCheck<4, ImmCheck0_15>]>; + def SVUSMLALL_LANE_VG4x2 : Inst<"svusmla_lane_za32[_{d}]_vg4x2", "vmi2xi", "Uc", "aarch64_sme_usmla_za32_lane_vg4x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>, ImmCheck<4, ImmCheck0_15>]>; + def SVUSMLALL_LANE_VG4x4 : Inst<"svusmla_lane_za32[_{d}]_vg4x4", "vmi4xi", "Uc", "aarch64_sme_usmla_za32_lane_vg4x4", [IsStreaming, IsSharedZA, 
IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>, ImmCheck<4, ImmCheck0_15>]>; + + // VERTICAL DOT-PRODUCT + def SVVDOT_LANE_ZA32_VG1x2_S : Inst<"svvdot_lane_za32[_{d}]_vg1x2", "vmi2di", "s", "aarch64_sme_svdot_lane_za32_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>, ImmCheck<4, ImmCheck0_3>]>; + def SVVDOT_LANE_ZA32_VG1x4_S : Inst<"svvdot_lane_za32[_{d}]_vg1x4", "vmi4di", "c", "aarch64_sme_svdot_lane_za32_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>, ImmCheck<4, ImmCheck0_3>]>; + def SVVDOT_LANE_ZA32_VG1x2_U : Inst<"svvdot_lane_za32[_{d}]_vg1x2", "vmi2di", "Us", "aarch64_sme_uvdot_lane_za32_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>, ImmCheck<4, ImmCheck0_3>]>; + def SVVDOT_LANE_ZA32_VG1x4_U : Inst<"svvdot_lane_za32[_{d}]_vg1x4", "vmi4di", "Uc", "aarch64_sme_uvdot_lane_za32_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>, ImmCheck<4, ImmCheck0_3>]>; + def SVVDOT_LANE_ZA32_VG1x2_F : Inst<"svvdot_lane_za32[_{d}]_vg1x2", "vmi2di", "hb", "aarch64_sme_fvdot_lane_za32_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>, ImmCheck<4, ImmCheck0_3>]>; + def SVSUVDOT_LANE_ZA32_VG1x4 : Inst<"svsuvdot_lane_za32[_{d}]_vg1x4", "vmi4di", "c", "aarch64_sme_suvdot_lane_za32_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>, ImmCheck<4, ImmCheck0_3>]>; + def SVUSVDOT_LANE_ZA32_VG1x4 : Inst<"svusvdot_lane_za32[_{d}]_vg1x4", "vmi4di", "Uc", "aarch64_sme_usvdot_lane_za32_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>, ImmCheck<4, ImmCheck0_3>]>; + + let TargetGuard = "sme-i16i64" in { + def SVVDOT_LANE_ZA64_VG1x4_S : Inst<"svvdot_lane_za64[_{d}]_vg1x4", "vmi4di", "s", "aarch64_sme_svdot_lane_za64_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>, ImmCheck<4, ImmCheck0_1>]>; + def SVVDOT_LANE_ZA64_VG1x4_U : Inst<"svvdot_lane_za64[_{d}]_vg1x4", "vmi4di", "Us", "aarch64_sme_uvdot_lane_za64_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>, ImmCheck<4, ImmCheck0_1>]>; + } + + // Multi-vector signed & unsigned integer dot-product + def SVDOT_SINGLE_ZA32_VG1x2_S : Inst<"svdot[_single]_za32[_{d}]_vg1x2", "vmi2d", "cs", "aarch64_sme_sdot_single_za32_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVDOT_SINGLE_ZA32_VG1x4_S : Inst<"svdot[_single]_za32[_{d}]_vg1x4", "vmi4d", "cs", "aarch64_sme_sdot_single_za32_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVDOT_SINGLE_ZA32_VG1x2_U : Inst<"svdot[_single]_za32[_{d}]_vg1x2", "vmi2d", "UcUs", "aarch64_sme_udot_single_za32_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVDOT_SINGLE_ZA32_VG1x4_U : Inst<"svdot[_single]_za32[_{d}]_vg1x4", "vmi4d", "UcUs", "aarch64_sme_udot_single_za32_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVDOT_MULTI_ZA32_VG1x2_S : Inst<"svdot_za32[_{d}]_vg1x2", "vmi22", "cs", "aarch64_sme_sdot_za32_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVDOT_MULTI_ZA32_VG1x4_S : Inst<"svdot_za32[_{d}]_vg1x4", "vmi44", "cs", "aarch64_sme_sdot_za32_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVDOT_MULTI_ZA32_VG1x2_U : 
Inst<"svdot_za32[_{d}]_vg1x2", "vmi22", "UcUs", "aarch64_sme_udot_za32_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVDOT_MULTI_ZA32_VG1x4_U : Inst<"svdot_za32[_{d}]_vg1x4", "vmi44", "UcUs", "aarch64_sme_udot_za32_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVDOT_LANE_ZA32_VG1x2_S : Inst<"svdot_lane_za32[_{d}]_vg1x2", "vmi2di", "cs", "aarch64_sme_sdot_lane_za32_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>, ImmCheck<4, ImmCheck0_3>]>; + def SVDOT_LANE_ZA32_VG1x4_S : Inst<"svdot_lane_za32[_{d}]_vg1x4", "vmi4di", "cs", "aarch64_sme_sdot_lane_za32_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>, ImmCheck<4, ImmCheck0_3>]>; + def SVDOT_LANE_ZA32_VG1x2_U : Inst<"svdot_lane_za32[_{d}]_vg1x2", "vmi2di", "UcUs", "aarch64_sme_udot_lane_za32_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>, ImmCheck<4, ImmCheck0_3>]>; + def SVDOT_LANE_ZA32_VG1x4_U : Inst<"svdot_lane_za32[_{d}]_vg1x4", "vmi4di", "UcUs", "aarch64_sme_udot_lane_za32_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>, ImmCheck<4, ImmCheck0_3>]>; + + def SVUSDOT_SINGLE_ZA32_VG1x2 : Inst<"svusdot[_single]_za32[_{d}]_vg1x2", "vmi2.dx", "Uc", "aarch64_sme_usdot_single_za32_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVUSDOT_SINGLE_ZA32_VG1x4 : Inst<"svusdot[_single]_za32[_{d}]_vg1x4", "vmi4.dx", "Uc", "aarch64_sme_usdot_single_za32_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVUSDOT_MULTI_ZA32_VG1x2 : Inst<"svusdot_za32[_{d}]_vg1x2", "vmi2.d2.x", "Uc", "aarch64_sme_usdot_za32_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVUSDOT_MULTI_ZA32_VG1x4 : Inst<"svusdot_za32[_{d}]_vg1x4", "vmi4.d4.x", "Uc", "aarch64_sme_usdot_za32_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVUSDOT_LANE_ZA32_VG1x2 : Inst<"svusdot_lane_za32[_{d}]_vg1x2", "vmi2.dxi", "Uc", "aarch64_sme_usdot_lane_za32_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>, ImmCheck<4, ImmCheck0_3>]>; + def SVUSDOT_LANE_ZA32_VG1x4 : Inst<"svusdot_lane_za32[_{d}]_vg1x4", "vmi4.dxi", "Uc", "aarch64_sme_usdot_lane_za32_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>, ImmCheck<4, ImmCheck0_3>]>; + + def SVSUDOT_SINGLE_ZA32_VG1x2 : Inst<"svsudot[_single]_za32[_{d}]_vg1x2", "vmi2.du", "c", "aarch64_sme_sudot_single_za32_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVSUDOT_SINGLE_ZA32_VG1x4 : Inst<"svsudot[_single]_za32[_{d}]_vg1x4", "vmi4.du", "c", "aarch64_sme_sudot_single_za32_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVSUDOT_LANE_ZA32_VG1x2 : Inst<"svsudot_lane_za32[_{d}]_vg1x2", "vmi2.dui", "c", "aarch64_sme_sudot_lane_za32_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>, ImmCheck<4, ImmCheck0_3>]>; + def SVSUDOT_LANE_ZA32_VG1x4 : Inst<"svsudot_lane_za32[_{d}]_vg1x4", "vmi4.dui", "c", "aarch64_sme_sudot_lane_za32_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>, ImmCheck<4, ImmCheck0_3>]>; + + let TargetGuard = "sme-i16i64" in { + def SVDOT_SINGLE_ZA64_VG1x2_S16 : 
Inst<"svdot[_single]_za64[_{d}]_vg1x2", "vmi2d", "s", "aarch64_sme_sdot_single_za64_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVDOT_SINGLE_ZA64_VG1x4_S16 : Inst<"svdot[_single]_za64[_{d}]_vg1x4", "vmi4d", "s", "aarch64_sme_sdot_single_za64_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVDOT_SINGLE_ZA64_VG1x2_U16 : Inst<"svdot[_single]_za64[_{d}]_vg1x2", "vmi2d", "Us", "aarch64_sme_udot_single_za64_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVDOT_SINGLE_ZA64_VG1x4_U16 : Inst<"svdot[_single]_za64[_{d}]_vg1x4", "vmi4d", "Us", "aarch64_sme_udot_single_za64_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVDOT_MULTI_ZA64_VG1x2_S16 : Inst<"svdot_za64[_{d}]_vg1x2", "vmi22", "s", "aarch64_sme_sdot_za64_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVDOT_MULTI_ZA64_VG1x4_S16 : Inst<"svdot_za64[_{d}]_vg1x4", "vmi44", "s", "aarch64_sme_sdot_za64_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVDOT_MULTI_ZA64_VG1x2_U16 : Inst<"svdot_za64[_{d}]_vg1x2", "vmi22", "Us", "aarch64_sme_udot_za64_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVDOT_MULTI_ZA64_VG1x4_U16 : Inst<"svdot_za64[_{d}]_vg1x4", "vmi44", "Us", "aarch64_sme_udot_za64_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVDOT_LANE_ZA64_VG1x2_S16 : Inst<"svdot_lane_za64[_{d}]_vg1x2", "vmi2di", "s", "aarch64_sme_sdot_lane_za64_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>, ImmCheck<4, ImmCheck0_1>]>; + def SVDOT_LANE_ZA64_VG1x4_S16 : Inst<"svdot_lane_za64[_{d}]_vg1x4", "vmi4di", "s", "aarch64_sme_sdot_lane_za64_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>, ImmCheck<4, ImmCheck0_1>]>; + def SVDOT_LANE_ZA64_VG1x2_U16 : Inst<"svdot_lane_za64[_{d}]_vg1x2", "vmi2di", "Us", "aarch64_sme_udot_lane_za64_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>, ImmCheck<4, ImmCheck0_1>]>; + def SVDOT_LANE_ZA64_VG1x4_U16 : Inst<"svdot_lane_za64[_{d}]_vg1x4", "vmi4di", "Us", "aarch64_sme_udot_lane_za64_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>, ImmCheck<4, ImmCheck0_1>]>; + } + + // Multi-vector half-precision/BFloat16 floating-point dot-product + def SVDOT_SINGLE_ZA32_VG1x2_F16 : Inst<"svdot[_single]_za32[_{d}]_vg1x2", "vmi2d", "bh", "aarch64_sme_fdot_single_za32_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVDOT_SINGLE_ZA32_VG1x4_F16 : Inst<"svdot[_single]_za32[_{d}]_vg1x4", "vmi4d", "bh", "aarch64_sme_fdot_single_za32_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVDOT_MULTI_ZA32_VG1x2_F16 : Inst<"svdot_za32[_{d}]_vg1x2", "vmi22", "bh", "aarch64_sme_fdot_za32_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVDOT_MULTI_ZA32_VG1x4_F16 : Inst<"svdot_za32[_{d}]_vg1x4", "vmi44", "bh", "aarch64_sme_fdot_za32_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVDOT_LANE_ZA32_VG1x2_F16 : Inst<"svdot_lane_za32[_{d}]_vg1x2", "vmi2di", "bh", "aarch64_sme_fdot_lane_za32_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, 
ImmCheck0_7>, ImmCheck<4, ImmCheck0_3>]>; + def SVDOT_LANE_ZA32_VG1x4_F16 : Inst<"svdot_lane_za32[_{d}]_vg1x4", "vmi4di", "bh", "aarch64_sme_fdot_lane_za32_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>, ImmCheck<4, ImmCheck0_3>]>; +} + +// == ADD /SUB == +let TargetGuard = "sme2" in { + def SVADD_WRITE_SINGLE_ZA32_VG1X2_I32 : Inst<"svadd_write[_single]_za32[_{d}]_vg1x2", "vmi2d", "iUi", "aarch64_sme_add_write_single_za_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVADD_WRITE_SINGLE_ZA32_VG1X4_I32 : Inst<"svadd_write[_single]_za32[_{d}]_vg1x4", "vmi4d", "iUi", "aarch64_sme_add_write_single_za_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + + def SVSUB_WRITE_SINGLE_ZA32_VG1X2_I32 : Inst<"svsub_write[_single]_za32[_{d}]_vg1x2", "vmi2d", "Ui", "aarch64_sme_sub_write_single_za_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVSUB_WRITE_SINGLE_ZA32_VG1X4_I32 : Inst<"svsub_write[_single]_za32[_{d}]_vg1x4", "vmi4d", "Ui", "aarch64_sme_sub_write_single_za_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + + def SVADD_WRITE_ZA32_VG1x2_I32 : Inst<"svadd_write_za32[_{d}]_vg1x2", "vmi22", "iUi", "aarch64_sme_add_write_za_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVADD_WRITE_ZA32_VG1x4_I32 : Inst<"svadd_write_za32[_{d}]_vg1x4", "vmi44", "iUi", "aarch64_sme_add_write_za_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + + def SVSUB_WRITE_ZA32_VG1x2_I32 : Inst<"svsub_write_za32[_{d}]_vg1x2", "vmi22", "Ui", "aarch64_sme_sub_write_za_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVSUB_WRITE_ZA32_VG1x4_I32 : Inst<"svsub_write_za32[_{d}]_vg1x4", "vmi44", "Ui", "aarch64_sme_sub_write_za_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + + def SVADD_ZA32_VG1x2_I32 : Inst<"svadd_za32[_{d}]_vg1x2", "vmi2", "iUif", "aarch64_sme_add_za32_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVADD_ZA32_VG1X4_I32 : Inst<"svadd_za32[_{d}]_vg1x4", "vmi4", "iUif", "aarch64_sme_add_za32_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + + def SVSUB_ZA32_VG1X2_I32 : Inst<"svsub_za32[_{d}]_vg1x2", "vmi2", "Uif", "aarch64_sme_sub_za32_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVSUB_ZA32_VG1X4_I32 : Inst<"svsub_za32[_{d}]_vg1x4", "vmi4", "Uif", "aarch64_sme_sub_za32_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + + let TargetGuard = "sme-i16i64" in { + def SVADD_WRITE_SINGLE_ZA64_VG1X2_I64 : Inst<"svadd_write[_single]_za64[_{d}]_vg1x2", "vmi2d", "lUl", "aarch64_sme_add_write_single_za_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVADD_WRITE_SINGLE_ZA64_VG1X4_I64 : Inst<"svadd_write[_single]_za64[_{d}]_vg1x4", "vmi4d", "lUl", "aarch64_sme_add_write_single_za_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + + def SVSUB_WRITE_SINGLE_ZA64_VG1X2_I64 : Inst<"svsub_write[_single]_za64[_{d}]_vg1x2", "vmi2d", "Ul", "aarch64_sme_sub_write_single_za_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVSUB_WRITE_SINGLE_ZA64_VG1X4_I64 : 
Inst<"svsub_write[_single]_za64[_{d}]_vg1x4", "vmi4d", "Ul", "aarch64_sme_sub_write_single_za_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + + def SVADD_WRITE_ZA64_VG1x2_I64 : Inst<"svadd_write_za64[_{d}]_vg1x2", "vmi22", "lUl", "aarch64_sme_add_write_za_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVADD_WRITE_ZA64_VG1x4_I64 : Inst<"svadd_write_za64[_{d}]_vg1x4", "vmi44", "lUl", "aarch64_sme_add_write_za_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + + def SVSUB_WRITE_ZA64_VG1x2_I64 : Inst<"svsub_write_za64[_{d}]_vg1x2", "vmi22", "Ul", "aarch64_sme_sub_write_za_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVSUB_WRITE_ZA64_VG1x4_I64 : Inst<"svsub_write_za64[_{d}]_vg1x4", "vmi44", "Ul", "aarch64_sme_sub_write_za_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + + def SVADD_ZA64_VG1X2_I64 : Inst<"svadd_za64[_{d}]_vg1x2", "vmi2", "lUl", "aarch64_sme_add_za64_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVADD_ZA64_VG1X4_I64 : Inst<"svadd_za64[_{d}]_vg1x4", "vmi4", "lUl", "aarch64_sme_add_za64_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + + def SVSUB_ZA64_VG1X2_I64 : Inst<"svsub_za64[_{d}]_vg1x2", "vmi2", "Ul", "aarch64_sme_sub_za64_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVSUB_ZA64_VG1X4_I64 : Inst<"svsub_za64[_{d}]_vg1x4", "vmi4", "Ul", "aarch64_sme_sub_za64_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + } + + let TargetGuard = "sme-f64f64" in { + def SVADD_ZA64_VG1X2_F64 : Inst<"svadd_za64[_{d}]_vg1x2", "vmi2", "d", "aarch64_sme_add_za64_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVADD_ZA64_VG1X4_F64 : Inst<"svadd_za64[_{d}]_vg1x4", "vmi4", "d", "aarch64_sme_add_za64_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + + def SVSUB_ZA64_VG1X2_F64 : Inst<"svsub_za64[_{d}]_vg1x2", "vmi2", "d", "aarch64_sme_sub_za64_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVSUB_ZA64_VG1X4_F64 : Inst<"svsub_za64[_{d}]_vg1x4", "vmi4", "d", "aarch64_sme_sub_za64_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + } +} +// +// 2 and 4 vector-group read/write intrinsics. 
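Note on the tile read/write group that follows: the WriteHV_VG and ReadHV_VG multiclasses expand, per element type, into horizontal and vertical ZA tile accessors that move two or four SVE vectors at a time. As a rough, non-authoritative sketch of how the generated za32 variants would be used from C: the signatures are inferred from the "vimi2"/"2imi" prototype strings together with the prototype-modifier legend visible further down in this diff, and the header name and attribute spellings are assumptions here rather than something this hunk states. The IsStreaming and IsSharedZA flags on the definitions are what motivate the attributes on the caller in the sketch.

    /* Hypothetical sketch: copy two rows of one 32-bit ZA tile into another
     * using the vg2 tile read/write intrinsics declared below.  Tile numbers
     * and slice offsets are immediates (0..3 and {0,2} respectively here). */
    #include <arm_sme.h>   /* assumed SME ACLE header name */

    void copy_rows(uint32_t slice)
        __attribute__((arm_streaming, arm_shared_za)) {
      /* Read rows {slice, slice+1} of tile 0 horizontally... */
      svfloat32x2_t rows =
          svread_hor_za32_f32_vg2(/*tile=*/0, slice, /*offset=*/0);
      /* ...and write them back vertically into tile 1. */
      svwrite_ver_za32_f32_vg2(/*tile=*/1, slice, /*offset=*/0, rows);
    }
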
+// + +multiclass WriteHV_VG checkvg2, list checkvg4> { + def NAME # _VG2_H : Inst<"svwrite_hor_" # n # "_vg2", "vimi2", t, i # "_hor_vg2", [IsSharedZA, IsStreaming, IsTileBaseOffsetIntr], checkvg2>; + def NAME # _VG2_V : Inst<"svwrite_ver_" # n # "_vg2", "vimi2", t, i # "_ver_vg2", [IsSharedZA, IsStreaming, IsTileBaseOffsetIntr], checkvg2>; + def NAME # _VG4_H : Inst<"svwrite_hor_" # n # "_vg4", "vimi4", t, i # "_hor_vg4", [IsSharedZA, IsStreaming, IsTileBaseOffsetIntr], checkvg4>; + def NAME # _VG4_V : Inst<"svwrite_ver_" # n # "_vg4", "vimi4", t, i # "_ver_vg4", [IsSharedZA, IsStreaming, IsTileBaseOffsetIntr], checkvg4>; +} + +let TargetGuard = "sme2" in { + defm SVWRITE_ZA8 : WriteHV_VG<"za8[_{d}]", "cUc", "aarch64_sme_write", [ImmCheck<0, ImmCheck0>, ImmCheck<2, ImmCheck0_14_Mul2>], [ImmCheck<0, ImmCheck0>, ImmCheck<2, ImmCheck0_12_Mul4>]>; + defm SVWRITE_ZA16 : WriteHV_VG<"za16[_{d}]", "sUshb", "aarch64_sme_write", [ImmCheck<0, ImmCheck0_1>, ImmCheck<2, ImmCheck0_6_Mul2>], [ImmCheck<0, ImmCheck0_1>, ImmCheck<2, ImmCheck0_4_Mul4>]>; + defm SVWRITE_ZA32 : WriteHV_VG<"za32[_{d}]", "iUif", "aarch64_sme_write", [ImmCheck<0, ImmCheck0_3>, ImmCheck<2, ImmCheck0_2_Mul2>], [ImmCheck<0, ImmCheck0_3>, ImmCheck<2, ImmCheck0>]>; + defm SVWRITE_ZA64 : WriteHV_VG<"za64[_{d}]", "lUld", "aarch64_sme_write", [ImmCheck<0, ImmCheck0_7>, ImmCheck<2, ImmCheck0>], [ImmCheck<0, ImmCheck0_7>, ImmCheck<2, ImmCheck0>]>; +} + +multiclass ReadHV_VG checkvg2, list checkvg4> { + def NAME # _VG2_H : Inst<"svread_hor_" # n # "_vg2", "2imi", t, i # "_hor_vg2", [IsSharedZA, IsPreservedZA, IsStreaming, IsTileBaseOffsetIntr], checkvg2>; + def NAME # _VG2_V : Inst<"svread_ver_" # n # "_vg2", "2imi", t, i # "_ver_vg2", [IsSharedZA, IsPreservedZA, IsStreaming, IsTileBaseOffsetIntr], checkvg2>; + def NAME # _VG4_H : Inst<"svread_hor_" # n # "_vg4", "4imi", t, i # "_hor_vg4", [IsSharedZA, IsPreservedZA, IsStreaming, IsTileBaseOffsetIntr], checkvg4>; + def NAME # _VG4_V : Inst<"svread_ver_" # n # "_vg4", "4imi", t, i # "_ver_vg4", [IsSharedZA, IsPreservedZA, IsStreaming, IsTileBaseOffsetIntr], checkvg4>; +} + +let TargetGuard = "sme2" in { + defm SVREAD_ZA8 : ReadHV_VG<"za8_{d}", "cUc", "aarch64_sme_read", [ImmCheck<0, ImmCheck0>, ImmCheck<2, ImmCheck0_14_Mul2>], [ImmCheck<0, ImmCheck0>, ImmCheck<2, ImmCheck0_12_Mul4>]>; + defm SVREAD_ZA16 : ReadHV_VG<"za16_{d}", "sUshb", "aarch64_sme_read", [ImmCheck<0, ImmCheck0_1>, ImmCheck<2, ImmCheck0_6_Mul2>], [ImmCheck<0, ImmCheck0_1>, ImmCheck<2, ImmCheck0_4_Mul4>]>; + defm SVREAD_ZA32 : ReadHV_VG<"za32_{d}", "iUif", "aarch64_sme_read", [ImmCheck<0, ImmCheck0_3>, ImmCheck<2, ImmCheck0_2_Mul2>], [ImmCheck<0, ImmCheck0_3>, ImmCheck<2, ImmCheck0>]>; + defm SVREAD_ZA64 : ReadHV_VG<"za64_{d}", "lUld", "aarch64_sme_read", [ImmCheck<0, ImmCheck0_7>, ImmCheck<2, ImmCheck0>], [ImmCheck<0, ImmCheck0_7>, ImmCheck<2, ImmCheck0>]>; +} + + +// +// Single vector-group read/write intrinsics. 
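The single vector-group forms declared next read or write two or four whole ZA vector groups at 64-bit granularity, addressed by a runtime slice base ('m', a uint32_t) plus a small immediate offset ('i'). A minimal usage sketch, with the same caveats as the sketch above (signatures inferred from the "vmi2"/"2mi" prototypes; header and attribute spellings assumed):

    /* Hypothetical sketch: read two f64 vector groups at immediate offset 0
     * and store them back at offset 2 (offsets are 0..7 immediates). */
    #include <arm_sme.h>   /* assumed SME ACLE header name */

    void copy_groups(uint32_t base)
        __attribute__((arm_streaming, arm_shared_za)) {
      svfloat64x2_t zn = svread_za64_f64_vg1x2(base, /*offset=*/0);
      svwrite_za64_f64_vg1x2(base, /*offset=*/2, zn);
    }
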
+// + +let TargetGuard = "sme2" in { + def SVWRITE_ZA64_VG1x2 : Inst<"svwrite_za64[_{d}]_vg1x2", "vmi2", "lUld", "aarch64_sme_write_vg1x2", [IsSharedZA, IsStreaming, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVWRITE_ZA64_VG1x4 : Inst<"svwrite_za64[_{d}]_vg1x4", "vmi4", "lUld", "aarch64_sme_write_vg1x4", [IsSharedZA, IsStreaming, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVREAD_ZA64_VG1x2 : Inst<"svread_za64_{d}_vg1x2", "2mi", "lUld", "aarch64_sme_read_vg1x2", [IsSharedZA, IsPreservedZA, IsStreaming, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVREAD_ZA64_VG1x4 : Inst<"svread_za64_{d}_vg1x4", "4mi", "lUld", "aarch64_sme_read_vg1x4", [IsSharedZA, IsPreservedZA, IsStreaming, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; +} + +// +// Outer produce and accumulate/subtract +// + +let TargetGuard = "sme2" in { + def SVSMOPA : Inst<"svmopa_za32[_{d}]_m", "viPPdd", "s", "aarch64_sme_smopa_za32", [IsSharedZA, IsStreaming], [ImmCheck<0, ImmCheck0_3>]>; + def SVUSMOPA : Inst<"svmopa_za32[_{d}]_m", "viPPdd", "Us", "aarch64_sme_umopa_za32", [IsSharedZA, IsStreaming], [ImmCheck<0, ImmCheck0_3>]>; + + def SVSMOPS : Inst<"svmops_za32[_{d}]_m", "viPPdd", "s", "aarch64_sme_smops_za32", [IsSharedZA, IsStreaming], [ImmCheck<0, ImmCheck0_3>]>; + def SVUSMOPS : Inst<"svmops_za32[_{d}]_m", "viPPdd", "Us", "aarch64_sme_umops_za32", [IsSharedZA, IsStreaming], [ImmCheck<0, ImmCheck0_3>]>; + + def SVBMOPA : Inst<"svbmopa_za32[_{d}]_m", "viPPdd", "Ui", "aarch64_sme_bmopa_za32", [IsSharedZA, IsStreaming], [ImmCheck<0, ImmCheck0_3>]>; + + def SVBMOPS : Inst<"svbmops_za32[_{d}]_m", "viPPdd", "Ui", "aarch64_sme_bmops_za32", [IsSharedZA, IsStreaming], [ImmCheck<0, ImmCheck0_3>]>; +} + +// +// Spill and fill of ZT0 +// + +let TargetGuard = "sme2" in { + def SVLDR_ZT : Inst<"svldr_zt", "viQ", "", "aarch64_sme_ldr_zt", [IsOverloadNone, IsStreamingCompatible, IsSharedZA, IsPreservedZA], [ImmCheck<0, ImmCheck0>]>; + def SVSTR_ZT : Inst<"svstr_zt", "vi{", "", "aarch64_sme_str_zt", [IsOverloadNone, IsStreamingCompatible, IsSharedZA, IsPreservedZA], [ImmCheck<0, ImmCheck0>]>; +} + +// +// Zero ZT0 +// + +let TargetGuard = "sme2" in { + def SVZERO_ZT : Inst<"svzero_zt", "vi", "", "aarch64_sme_zero_zt", [IsOverloadNone, IsStreamingCompatible, IsSharedZA, IsPreservedZA], [ImmCheck<0, ImmCheck0>]>; +} + +// +// lookup table expand one register +// + +let TargetGuard = "sme2" in { + def SVLUTI2_LANE_ZT : Inst<"svluti2_lane_zt[_{d}]", "didi", "UcUsUi", "aarch64_sme_luti2_lane_zt", [IsStreaming, IsSharedZA, IsPreservedZA], [ImmCheck<0, ImmCheck0>, ImmCheck<2, ImmCheck0_15>]>; + def SVLUTI4_LANE_ZT : Inst<"svluti4_lane_zt[_{d}]", "didi", "UcUsUi", "aarch64_sme_luti4_lane_zt", [IsStreaming, IsSharedZA, IsPreservedZA], [ImmCheck<0, ImmCheck0>, ImmCheck<2, ImmCheck0_7>]>; +} + +// +// lookup table expand two contiguous registers +// + +let TargetGuard = "sme2" in { + def SVLUTI2_LANE_ZT_X2 : Inst<"svluti2_lane_zt[_{d}]_x2", "2.didi", "UcUsUi", "aarch64_sme_luti2_lane_zt_x2", [IsStreaming, IsSharedZA, IsPreservedZA], [ImmCheck<0, ImmCheck0>, ImmCheck<2, ImmCheck0_7>]>; + def SVLUTI4_LANE_ZT_X2 : Inst<"svluti4_lane_zt[_{d}]_x2", "2.didi", "UcUsUi", "aarch64_sme_luti4_lane_zt_x2", [IsStreaming, IsSharedZA, IsPreservedZA], [ImmCheck<0, ImmCheck0>, ImmCheck<2, ImmCheck0_3>]>; +} + +// +// lookup table expand four contiguous registers +// + +let TargetGuard = "sme2" in { + def SVLUTI2_LANE_ZT_X4 : Inst<"svluti2_lane_zt[_{d}]_x4", "4.didi", "UcUsUi", "aarch64_sme_luti2_lane_zt_x4", 
[IsStreaming, IsSharedZA, IsPreservedZA], [ImmCheck<0, ImmCheck0>, ImmCheck<2, ImmCheck0_3>]>; + def SVLUTI4_LANE_ZT_X4 : Inst<"svluti4_lane_zt[_{d}]_x4", "4.didi", "UsUi", "aarch64_sme_luti4_lane_zt_x4", [IsStreaming, IsSharedZA, IsPreservedZA], [ImmCheck<0, ImmCheck0>, ImmCheck<2, ImmCheck0_1>]>; +} diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td --- a/clang/include/clang/Basic/arm_sve.td +++ b/clang/include/clang/Basic/arm_sve.td @@ -13,227 +13,7 @@ // //===----------------------------------------------------------------------===// -//===----------------------------------------------------------------------===// -// Instruction definitions -//===----------------------------------------------------------------------===// -// Every intrinsic subclasses "Inst". An intrinsic has a name, a prototype and -// a sequence of typespecs. -// -// The name is the base name of the intrinsic, for example "svld1". This is -// then mangled by the tblgen backend to add type information ("svld1_s16"). -// -// A typespec is a sequence of uppercase characters (modifiers) followed by one -// lowercase character. A typespec encodes a particular "base type" of the -// intrinsic. -// -// An example typespec is "Us" - unsigned short - svuint16_t. The available -// typespec codes are given below. -// -// The string given to an Inst class is a sequence of typespecs. The intrinsic -// is instantiated for every typespec in the sequence. For example "sdUsUd". -// -// The prototype is a string that defines the return type of the intrinsic -// and the type of each argument. The return type and every argument gets a -// "modifier" that can change in some way the "base type" of the intrinsic. -// -// The modifier 'd' means "default" and does not modify the base type in any -// way. The available modifiers are given below. -// -// Typespecs -// --------- -// c: char -// s: short -// i: int -// l: long -// f: float -// h: half-float -// d: double -// b: bfloat - -// Typespec modifiers -// ------------------ -// P: boolean -// U: unsigned - -// Prototype modifiers -// ------------------- -// prototype: return (arg, arg, ...) -// -// 2,3,4: array of default vectors -// v: void -// x: vector of signed integers -// u: vector of unsigned integers -// d: default -// c: const pointer type -// P: predicate type -// s: scalar of element type -// a: scalar of element type (splat to vector type) -// R: scalar of 1/2 width element type (splat to vector type) -// r: scalar of 1/4 width element type (splat to vector type) -// @: unsigned scalar of 1/4 width element type (splat to vector type) -// e: 1/2 width unsigned elements, 2x element count -// b: 1/4 width unsigned elements, 4x element count -// h: 1/2 width elements, 2x element count -// q: 1/4 width elements, 4x element count -// o: 4x width elements, 1/4 element count -// -// w: vector of element type promoted to 64bits, vector maintains -// signedness of its element type. 
-// f: element type promoted to uint64_t (splat to vector type) -// j: element type promoted to 64bits (splat to vector type) -// K: element type bitcast to a signed integer (splat to vector type) -// L: element type bitcast to an unsigned integer (splat to vector type) -// -// i: constant uint64_t -// k: int32_t -// l: int64_t -// m: uint32_t -// n: uint64_t - -// t: svint32_t -// z: svuint32_t -// g: svuint64_t -// O: svfloat16_t -// M: svfloat32_t -// N: svfloat64_t - -// J: Prefetch type (sv_prfop) -// A: pointer to int8_t -// B: pointer to int16_t -// C: pointer to int32_t -// D: pointer to int64_t - -// E: pointer to uint8_t -// F: pointer to uint16_t -// G: pointer to uint32_t -// H: pointer to uint64_t - -// Q: const pointer to void - -// S: const pointer to int8_t -// T: const pointer to int16_t -// U: const pointer to int32_t -// V: const pointer to int64_t -// -// W: const pointer to uint8_t -// X: const pointer to uint16_t -// Y: const pointer to uint32_t -// Z: const pointer to uint64_t - -class MergeType { - int Value = val; - string Suffix = suffix; -} -def MergeNone : MergeType<0>; -def MergeAny : MergeType<1, "_x">; -def MergeOp1 : MergeType<2, "_m">; -def MergeZero : MergeType<3, "_z">; -def MergeAnyExp : MergeType<4, "_x">; // Use merged builtin with explicit -def MergeZeroExp : MergeType<5, "_z">; // generation of its inactive argument. - -class EltType { - int Value = val; -} -def EltTyInvalid : EltType<0>; -def EltTyInt8 : EltType<1>; -def EltTyInt16 : EltType<2>; -def EltTyInt32 : EltType<3>; -def EltTyInt64 : EltType<4>; -def EltTyFloat16 : EltType<5>; -def EltTyFloat32 : EltType<6>; -def EltTyFloat64 : EltType<7>; -def EltTyBool8 : EltType<8>; -def EltTyBool16 : EltType<9>; -def EltTyBool32 : EltType<10>; -def EltTyBool64 : EltType<11>; -def EltTyBFloat16 : EltType<12>; - -class MemEltType { - int Value = val; -} -def MemEltTyDefault : MemEltType<0>; -def MemEltTyInt8 : MemEltType<1>; -def MemEltTyInt16 : MemEltType<2>; -def MemEltTyInt32 : MemEltType<3>; -def MemEltTyInt64 : MemEltType<4>; - -class FlagType { - int Value = val; -} - -// These must be kept in sync with the flags in utils/TableGen/SveEmitter.h -// and include/clang/Basic/TargetBuiltins.h -def NoFlags : FlagType<0x00000000>; -def FirstEltType : FlagType<0x00000001>; -// : : -// : : -def EltTypeMask : FlagType<0x0000000f>; -def FirstMemEltType : FlagType<0x00000010>; -// : : -// : : -def MemEltTypeMask : FlagType<0x00000070>; -def FirstMergeTypeMask : FlagType<0x00000080>; -// : : -// : : -def MergeTypeMask : FlagType<0x00000380>; -def FirstSplatOperand : FlagType<0x00000400>; -// : : -// These flags are used to specify which scalar operand -// needs to be duplicated/splatted into a vector. -// : : -def SplatOperandMask : FlagType<0x00001C00>; -def IsLoad : FlagType<0x00002000>; -def IsStore : FlagType<0x00004000>; -def IsGatherLoad : FlagType<0x00008000>; -def IsScatterStore : FlagType<0x00010000>; -def IsStructLoad : FlagType<0x00020000>; -def IsStructStore : FlagType<0x00040000>; -def IsZExtReturn : FlagType<0x00080000>; // Return value is sign-extend by default -def IsOverloadNone : FlagType<0x00100000>; // Intrinsic does not take any overloaded types. -def IsOverloadWhile : FlagType<0x00200000>; // Use {default type, typeof(operand1)} as overloaded types. -def IsOverloadWhileRW : FlagType<0x00400000>; // Use {pred(default type), typeof(operand0)} as overloaded types. -def IsOverloadCvt : FlagType<0x00800000>; // Use {typeof(operand0), typeof(last operand)} as overloaded types. 
-def OverloadKindMask : FlagType<0x00E00000>; // When the masked values are all '0', the default type is used as overload type. -def IsByteIndexed : FlagType<0x01000000>; -def IsAppendSVALL : FlagType<0x02000000>; // Appends SV_ALL as the last operand. -def IsInsertOp1SVALL : FlagType<0x04000000>; // Inserts SV_ALL as the second operand. -def IsPrefetch : FlagType<0x08000000>; // Contiguous prefetches. -def IsGatherPrefetch : FlagType<0x10000000>; -def ReverseCompare : FlagType<0x20000000>; // Compare operands must be swapped. -def ReverseUSDOT : FlagType<0x40000000>; // Unsigned/signed operands must be swapped. -def IsUndef : FlagType<0x80000000>; // Codegen `undef` of given type. -def IsTupleCreate : FlagType<0x100000000>; -def IsTupleGet : FlagType<0x200000000>; -def IsTupleSet : FlagType<0x400000000>; -def ReverseMergeAnyBinOp : FlagType<0x800000000>; // e.g. Implement SUBR_X using SUB_X. -def ReverseMergeAnyAccOp : FlagType<0x1000000000>; // e.g. Implement MSB_X using MLS_X. - -// These must be kept in sync with the flags in include/clang/Basic/TargetBuiltins.h -class ImmCheckType { - int Value = val; -} -def ImmCheck0_31 : ImmCheckType<0>; // 0..31 (used for e.g. predicate patterns) -def ImmCheck1_16 : ImmCheckType<1>; // 1..16 -def ImmCheckExtract : ImmCheckType<2>; // 0..(2048/sizeinbits(elt) - 1) -def ImmCheckShiftRight : ImmCheckType<3>; // 1..sizeinbits(elt) -def ImmCheckShiftRightNarrow : ImmCheckType<4>; // 1..sizeinbits(elt)/2 -def ImmCheckShiftLeft : ImmCheckType<5>; // 0..(sizeinbits(elt) - 1) -def ImmCheck0_7 : ImmCheckType<6>; // 0..7 -def ImmCheckLaneIndex : ImmCheckType<7>; // 0..(128/(1*sizeinbits(elt)) - 1) -def ImmCheckLaneIndexCompRotate : ImmCheckType<8>; // 0..(128/(2*sizeinbits(elt)) - 1) -def ImmCheckLaneIndexDot : ImmCheckType<9>; // 0..(128/(4*sizeinbits(elt)) - 1) -def ImmCheckComplexRot90_270 : ImmCheckType<10>; // [90,270] -def ImmCheckComplexRotAll90 : ImmCheckType<11>; // [0, 90, 180,270] -def ImmCheck0_13 : ImmCheckType<12>; // 0..13 -def ImmCheck0_1 : ImmCheckType<13>; // 0..1 -def ImmCheck0_2 : ImmCheckType<14>; // 0..2 -def ImmCheck0_3 : ImmCheckType<15>; // 0..3 - -class ImmCheck { - int Arg = arg; - int EltSizeArg = eltSizeArg; - ImmCheckType Kind = kind; -} +include "arm_sve_common.td" class Inst ft, list ch, MemEltType met> { @@ -265,27 +45,27 @@ // Loads // Load one vector (scalar base) -def SVLD1 : MInst<"svld1[_{2}]", "dPc", "csilUcUsUiUlhfd", [IsLoad], MemEltTyDefault, "aarch64_sve_ld1">; -def SVLD1SB : MInst<"svld1sb_{d}", "dPS", "silUsUiUl", [IsLoad], MemEltTyInt8, "aarch64_sve_ld1">; -def SVLD1UB : MInst<"svld1ub_{d}", "dPW", "silUsUiUl", [IsLoad, IsZExtReturn], MemEltTyInt8, "aarch64_sve_ld1">; -def SVLD1SH : MInst<"svld1sh_{d}", "dPT", "ilUiUl", [IsLoad], MemEltTyInt16, "aarch64_sve_ld1">; -def SVLD1UH : MInst<"svld1uh_{d}", "dPX", "ilUiUl", [IsLoad, IsZExtReturn], MemEltTyInt16, "aarch64_sve_ld1">; -def SVLD1SW : MInst<"svld1sw_{d}", "dPU", "lUl", [IsLoad], MemEltTyInt32, "aarch64_sve_ld1">; -def SVLD1UW : MInst<"svld1uw_{d}", "dPY", "lUl", [IsLoad, IsZExtReturn], MemEltTyInt32, "aarch64_sve_ld1">; +def SVLD1 : MInst<"svld1[_{2}]", "dPc", "csilUcUsUiUlhfd", [IsLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_ld1">; +def SVLD1SB : MInst<"svld1sb_{d}", "dPS", "silUsUiUl", [IsLoad, IsStreamingCompatible], MemEltTyInt8, "aarch64_sve_ld1">; +def SVLD1UB : MInst<"svld1ub_{d}", "dPW", "silUsUiUl", [IsLoad, IsZExtReturn, IsStreamingCompatible], MemEltTyInt8, "aarch64_sve_ld1">; +def SVLD1SH : MInst<"svld1sh_{d}", "dPT", "ilUiUl", 
[IsLoad, IsStreamingCompatible], MemEltTyInt16, "aarch64_sve_ld1">; +def SVLD1UH : MInst<"svld1uh_{d}", "dPX", "ilUiUl", [IsLoad, IsZExtReturn, IsStreamingCompatible], MemEltTyInt16, "aarch64_sve_ld1">; +def SVLD1SW : MInst<"svld1sw_{d}", "dPU", "lUl", [IsLoad, IsStreamingCompatible], MemEltTyInt32, "aarch64_sve_ld1">; +def SVLD1UW : MInst<"svld1uw_{d}", "dPY", "lUl", [IsLoad, IsZExtReturn, IsStreamingCompatible], MemEltTyInt32, "aarch64_sve_ld1">; let TargetGuard = "sve,bf16" in { - def SVLD1_BF : MInst<"svld1[_{2}]", "dPc", "b", [IsLoad], MemEltTyDefault, "aarch64_sve_ld1">; - def SVLD1_VNUM_BF : MInst<"svld1_vnum[_{2}]", "dPcl", "b", [IsLoad], MemEltTyDefault, "aarch64_sve_ld1">; + def SVLD1_BF : MInst<"svld1[_{2}]", "dPc", "b", [IsLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_ld1">; + def SVLD1_VNUM_BF : MInst<"svld1_vnum[_{2}]", "dPcl", "b", [IsLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_ld1">; } // Load one vector (scalar base, VL displacement) -def SVLD1_VNUM : MInst<"svld1_vnum[_{2}]", "dPcl", "csilUcUsUiUlhfd", [IsLoad], MemEltTyDefault, "aarch64_sve_ld1">; -def SVLD1SB_VNUM : MInst<"svld1sb_vnum_{d}", "dPSl", "silUsUiUl", [IsLoad], MemEltTyInt8, "aarch64_sve_ld1">; -def SVLD1UB_VNUM : MInst<"svld1ub_vnum_{d}", "dPWl", "silUsUiUl", [IsLoad, IsZExtReturn], MemEltTyInt8, "aarch64_sve_ld1">; -def SVLD1SH_VNUM : MInst<"svld1sh_vnum_{d}", "dPTl", "ilUiUl", [IsLoad], MemEltTyInt16, "aarch64_sve_ld1">; -def SVLD1UH_VNUM : MInst<"svld1uh_vnum_{d}", "dPXl", "ilUiUl", [IsLoad, IsZExtReturn], MemEltTyInt16, "aarch64_sve_ld1">; -def SVLD1SW_VNUM : MInst<"svld1sw_vnum_{d}", "dPUl", "lUl", [IsLoad], MemEltTyInt32, "aarch64_sve_ld1">; -def SVLD1UW_VNUM : MInst<"svld1uw_vnum_{d}", "dPYl", "lUl", [IsLoad, IsZExtReturn], MemEltTyInt32, "aarch64_sve_ld1">; +def SVLD1_VNUM : MInst<"svld1_vnum[_{2}]", "dPcl", "csilUcUsUiUlhfd", [IsLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_ld1">; +def SVLD1SB_VNUM : MInst<"svld1sb_vnum_{d}", "dPSl", "silUsUiUl", [IsLoad, IsStreamingCompatible], MemEltTyInt8, "aarch64_sve_ld1">; +def SVLD1UB_VNUM : MInst<"svld1ub_vnum_{d}", "dPWl", "silUsUiUl", [IsLoad, IsZExtReturn, IsStreamingCompatible], MemEltTyInt8, "aarch64_sve_ld1">; +def SVLD1SH_VNUM : MInst<"svld1sh_vnum_{d}", "dPTl", "ilUiUl", [IsLoad, IsStreamingCompatible], MemEltTyInt16, "aarch64_sve_ld1">; +def SVLD1UH_VNUM : MInst<"svld1uh_vnum_{d}", "dPXl", "ilUiUl", [IsLoad, IsZExtReturn, IsStreamingCompatible], MemEltTyInt16, "aarch64_sve_ld1">; +def SVLD1SW_VNUM : MInst<"svld1sw_vnum_{d}", "dPUl", "lUl", [IsLoad, IsStreamingCompatible], MemEltTyInt32, "aarch64_sve_ld1">; +def SVLD1UW_VNUM : MInst<"svld1uw_vnum_{d}", "dPYl", "lUl", [IsLoad, IsZExtReturn, IsStreamingCompatible], MemEltTyInt32, "aarch64_sve_ld1">; // Load one vector (vector base) def SVLD1_GATHER_BASES_U : MInst<"svld1_gather[_{2}base]_{d}", "dPu", "ilUiUlfd", [IsGatherLoad], MemEltTyDefault, "aarch64_sve_ld1_gather_scalar_offset">; @@ -489,25 +269,25 @@ } // Load one vector, unextended load, non-temporal (scalar base) -def SVLDNT1 : MInst<"svldnt1[_{2}]", "dPc", "csilUcUsUiUlhfd", [IsLoad], MemEltTyDefault, "aarch64_sve_ldnt1">; +def SVLDNT1 : MInst<"svldnt1[_{2}]", "dPc", "csilUcUsUiUlhfd", [IsLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_ldnt1">; // Load one vector, unextended load, non-temporal (scalar base, VL displacement) -def SVLDNT1_VNUM : MInst<"svldnt1_vnum[_{2}]", "dPcl", "csilUcUsUiUlhfd", [IsLoad], MemEltTyDefault, "aarch64_sve_ldnt1">; +def SVLDNT1_VNUM : 
MInst<"svldnt1_vnum[_{2}]", "dPcl", "csilUcUsUiUlhfd", [IsLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_ldnt1">; let TargetGuard = "sve,bf16" in { - def SVLDNT1_BF : MInst<"svldnt1[_{2}]", "dPc", "b", [IsLoad], MemEltTyDefault, "aarch64_sve_ldnt1">; - def SVLDNT1_VNUM_BF : MInst<"svldnt1_vnum[_{2}]", "dPcl", "b", [IsLoad], MemEltTyDefault, "aarch64_sve_ldnt1">; + def SVLDNT1_BF : MInst<"svldnt1[_{2}]", "dPc", "b", [IsLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_ldnt1">; + def SVLDNT1_VNUM_BF : MInst<"svldnt1_vnum[_{2}]", "dPcl", "b", [IsLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_ldnt1">; } // Load one quadword and replicate (scalar base) -def SVLD1RQ : SInst<"svld1rq[_{2}]", "dPc", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_ld1rq">; +def SVLD1RQ : SInst<"svld1rq[_{2}]", "dPc", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_ld1rq", [IsStreamingCompatible]>; let TargetGuard = "sve,bf16" in { - def SVLD1RQ_BF : SInst<"svld1rq[_{2}]", "dPc", "b", MergeNone, "aarch64_sve_ld1rq">; + def SVLD1RQ_BF : SInst<"svld1rq[_{2}]", "dPc", "b", MergeNone, "aarch64_sve_ld1rq", [IsStreamingCompatible]>; } multiclass StructLoad { - def : SInst; + def : SInst; let TargetGuard = "sve,bf16" in { def: SInst; } @@ -532,42 +312,42 @@ } let TargetGuard = "sve,bf16" in { - def SVBFDOT : SInst<"svbfdot[_{0}]", "MMdd", "b", MergeNone, "aarch64_sve_bfdot", [IsOverloadNone]>; - def SVBFMLALB : SInst<"svbfmlalb[_{0}]", "MMdd", "b", MergeNone, "aarch64_sve_bfmlalb", [IsOverloadNone]>; - def SVBFMLALT : SInst<"svbfmlalt[_{0}]", "MMdd", "b", MergeNone, "aarch64_sve_bfmlalt", [IsOverloadNone]>; + def SVBFDOT : SInst<"svbfdot[_{0}]", "MMdd", "b", MergeNone, "aarch64_sve_bfdot", [IsOverloadNone, IsStreamingCompatible]>; + def SVBFMLALB : SInst<"svbfmlalb[_{0}]", "MMdd", "b", MergeNone, "aarch64_sve_bfmlalb", [IsOverloadNone, IsStreamingCompatible]>; + def SVBFMLALT : SInst<"svbfmlalt[_{0}]", "MMdd", "b", MergeNone, "aarch64_sve_bfmlalt", [IsOverloadNone, IsStreamingCompatible]>; def SVBFMMLA : SInst<"svbfmmla[_{0}]", "MMdd", "b", MergeNone, "aarch64_sve_bfmmla", [IsOverloadNone]>; - def SVBFDOT_N : SInst<"svbfdot[_n_{0}]", "MMda", "b", MergeNone, "aarch64_sve_bfdot", [IsOverloadNone]>; - def SVBFMLAL_N : SInst<"svbfmlalb[_n_{0}]", "MMda", "b", MergeNone, "aarch64_sve_bfmlalb", [IsOverloadNone]>; - def SVBFMLALT_N : SInst<"svbfmlalt[_n_{0}]", "MMda", "b", MergeNone, "aarch64_sve_bfmlalt", [IsOverloadNone]>; - def SVBFDOT_LANE : SInst<"svbfdot_lane[_{0}]", "MMddi", "b", MergeNone, "aarch64_sve_bfdot_lane_v2", [IsOverloadNone], [ImmCheck<3, ImmCheck0_3>]>; - def SVBFMLALB_LANE : SInst<"svbfmlalb_lane[_{0}]", "MMddi", "b", MergeNone, "aarch64_sve_bfmlalb_lane_v2", [IsOverloadNone], [ImmCheck<3, ImmCheck0_7>]>; - def SVBFMLALT_LANE : SInst<"svbfmlalt_lane[_{0}]", "MMddi", "b", MergeNone, "aarch64_sve_bfmlalt_lane_v2", [IsOverloadNone], [ImmCheck<3, ImmCheck0_7>]>; + def SVBFDOT_N : SInst<"svbfdot[_n_{0}]", "MMda", "b", MergeNone, "aarch64_sve_bfdot", [IsOverloadNone, IsStreamingCompatible]>; + def SVBFMLAL_N : SInst<"svbfmlalb[_n_{0}]", "MMda", "b", MergeNone, "aarch64_sve_bfmlalb", [IsOverloadNone, IsStreamingCompatible]>; + def SVBFMLALT_N : SInst<"svbfmlalt[_n_{0}]", "MMda", "b", MergeNone, "aarch64_sve_bfmlalt", [IsOverloadNone, IsStreamingCompatible]>; + def SVBFDOT_LANE : SInst<"svbfdot_lane[_{0}]", "MMddi", "b", MergeNone, "aarch64_sve_bfdot_lane_v2", [IsOverloadNone, IsStreamingCompatible], [ImmCheck<3, ImmCheck0_3>]>; + def SVBFMLALB_LANE : SInst<"svbfmlalb_lane[_{0}]", "MMddi", 
"b", MergeNone, "aarch64_sve_bfmlalb_lane_v2", [IsOverloadNone, IsStreamingCompatible], [ImmCheck<3, ImmCheck0_7>]>; + def SVBFMLALT_LANE : SInst<"svbfmlalt_lane[_{0}]", "MMddi", "b", MergeNone, "aarch64_sve_bfmlalt_lane_v2", [IsOverloadNone, IsStreamingCompatible], [ImmCheck<3, ImmCheck0_7>]>; } //////////////////////////////////////////////////////////////////////////////// // Stores // Store one vector (scalar base) -def SVST1 : MInst<"svst1[_{d}]", "vPpd", "csilUcUsUiUlhfd", [IsStore], MemEltTyDefault, "aarch64_sve_st1">; -def SVST1B_S : MInst<"svst1b[_{d}]", "vPAd", "sil", [IsStore], MemEltTyInt8, "aarch64_sve_st1">; -def SVST1B_U : MInst<"svst1b[_{d}]", "vPEd", "UsUiUl", [IsStore], MemEltTyInt8, "aarch64_sve_st1">; -def SVST1H_S : MInst<"svst1h[_{d}]", "vPBd", "il", [IsStore], MemEltTyInt16, "aarch64_sve_st1">; -def SVST1H_U : MInst<"svst1h[_{d}]", "vPFd", "UiUl", [IsStore], MemEltTyInt16, "aarch64_sve_st1">; -def SVST1W_S : MInst<"svst1w[_{d}]", "vPCd", "l", [IsStore], MemEltTyInt32, "aarch64_sve_st1">; -def SVST1W_U : MInst<"svst1w[_{d}]", "vPGd", "Ul", [IsStore], MemEltTyInt32, "aarch64_sve_st1">; +def SVST1 : MInst<"svst1[_{d}]", "vPpd", "csilUcUsUiUlhfd", [IsStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_st1">; +def SVST1B_S : MInst<"svst1b[_{d}]", "vPAd", "sil", [IsStore, IsStreamingCompatible], MemEltTyInt8, "aarch64_sve_st1">; +def SVST1B_U : MInst<"svst1b[_{d}]", "vPEd", "UsUiUl", [IsStore, IsStreamingCompatible], MemEltTyInt8, "aarch64_sve_st1">; +def SVST1H_S : MInst<"svst1h[_{d}]", "vPBd", "il", [IsStore, IsStreamingCompatible], MemEltTyInt16, "aarch64_sve_st1">; +def SVST1H_U : MInst<"svst1h[_{d}]", "vPFd", "UiUl", [IsStore, IsStreamingCompatible], MemEltTyInt16, "aarch64_sve_st1">; +def SVST1W_S : MInst<"svst1w[_{d}]", "vPCd", "l", [IsStore, IsStreamingCompatible], MemEltTyInt32, "aarch64_sve_st1">; +def SVST1W_U : MInst<"svst1w[_{d}]", "vPGd", "Ul", [IsStore, IsStreamingCompatible], MemEltTyInt32, "aarch64_sve_st1">; // Store one vector (scalar base, VL displacement) -def SVST1_VNUM : MInst<"svst1_vnum[_{d}]", "vPpld", "csilUcUsUiUlhfd", [IsStore], MemEltTyDefault, "aarch64_sve_st1">; -def SVST1B_VNUM_S : MInst<"svst1b_vnum[_{d}]", "vPAld", "sil", [IsStore], MemEltTyInt8, "aarch64_sve_st1">; -def SVST1B_VNUM_U : MInst<"svst1b_vnum[_{d}]", "vPEld", "UsUiUl", [IsStore], MemEltTyInt8, "aarch64_sve_st1">; -def SVST1H_VNUM_S : MInst<"svst1h_vnum[_{d}]", "vPBld", "il", [IsStore], MemEltTyInt16, "aarch64_sve_st1">; -def SVST1H_VNUM_U : MInst<"svst1h_vnum[_{d}]", "vPFld", "UiUl", [IsStore], MemEltTyInt16, "aarch64_sve_st1">; -def SVST1W_VNUM_S : MInst<"svst1w_vnum[_{d}]", "vPCld", "l", [IsStore], MemEltTyInt32, "aarch64_sve_st1">; -def SVST1W_VNUM_U : MInst<"svst1w_vnum[_{d}]", "vPGld", "Ul", [IsStore], MemEltTyInt32, "aarch64_sve_st1">; +def SVST1_VNUM : MInst<"svst1_vnum[_{d}]", "vPpld", "csilUcUsUiUlhfd", [IsStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_st1">; +def SVST1B_VNUM_S : MInst<"svst1b_vnum[_{d}]", "vPAld", "sil", [IsStore, IsStreamingCompatible], MemEltTyInt8, "aarch64_sve_st1">; +def SVST1B_VNUM_U : MInst<"svst1b_vnum[_{d}]", "vPEld", "UsUiUl", [IsStore, IsStreamingCompatible], MemEltTyInt8, "aarch64_sve_st1">; +def SVST1H_VNUM_S : MInst<"svst1h_vnum[_{d}]", "vPBld", "il", [IsStore, IsStreamingCompatible], MemEltTyInt16, "aarch64_sve_st1">; +def SVST1H_VNUM_U : MInst<"svst1h_vnum[_{d}]", "vPFld", "UiUl", [IsStore, IsStreamingCompatible], MemEltTyInt16, "aarch64_sve_st1">; +def SVST1W_VNUM_S : MInst<"svst1w_vnum[_{d}]", "vPCld", "l", 
[IsStore, IsStreamingCompatible], MemEltTyInt32, "aarch64_sve_st1">; +def SVST1W_VNUM_U : MInst<"svst1w_vnum[_{d}]", "vPGld", "Ul", [IsStore, IsStreamingCompatible], MemEltTyInt32, "aarch64_sve_st1">; let TargetGuard = "sve,bf16" in { - def SVST1_BF : MInst<"svst1[_{d}]", "vPpd", "b", [IsStore], MemEltTyDefault, "aarch64_sve_st1">; - def SVST1_VNUM_BF : MInst<"svst1_vnum[_{d}]", "vPpld", "b", [IsStore], MemEltTyDefault, "aarch64_sve_st1">; + def SVST1_BF : MInst<"svst1[_{d}]", "vPpd", "b", [IsStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_st1">; + def SVST1_VNUM_BF : MInst<"svst1_vnum[_{d}]", "vPpld", "b", [IsStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_st1">; } // Store one vector (vector base) @@ -640,7 +420,7 @@ def SVST1W_SCATTER_INDEX_S : MInst<"svst1w_scatter[_{2}base]_index[_{d}]", "vPuld", "lUl", [IsScatterStore], MemEltTyInt32, "aarch64_sve_st1_scatter_scalar_offset">; multiclass StructStore { - def : SInst; + def : SInst; let TargetGuard = "sve,bf16" in { def: SInst; } @@ -656,30 +436,30 @@ defm SVST4_VNUM : StructStore<"svst4_vnum[_{d}]", "vPpl4", "aarch64_sve_st4">; // Store one vector, with no truncation, non-temporal (scalar base) -def SVSTNT1 : MInst<"svstnt1[_{d}]", "vPpd", "csilUcUsUiUlhfd", [IsStore], MemEltTyDefault, "aarch64_sve_stnt1">; +def SVSTNT1 : MInst<"svstnt1[_{d}]", "vPpd", "csilUcUsUiUlhfd", [IsStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_stnt1">; // Store one vector, with no truncation, non-temporal (scalar base, VL displacement) -def SVSTNT1_VNUM : MInst<"svstnt1_vnum[_{d}]", "vPpld", "csilUcUsUiUlhfd", [IsStore], MemEltTyDefault, "aarch64_sve_stnt1">; +def SVSTNT1_VNUM : MInst<"svstnt1_vnum[_{d}]", "vPpld", "csilUcUsUiUlhfd", [IsStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_stnt1">; let TargetGuard = "sve,bf16" in { - def SVSTNT1_BF : MInst<"svstnt1[_{d}]", "vPpd", "b", [IsStore], MemEltTyDefault, "aarch64_sve_stnt1">; - def SVSTNT1_VNUM_BF : MInst<"svstnt1_vnum[_{d}]", "vPpld", "b", [IsStore], MemEltTyDefault, "aarch64_sve_stnt1">; + def SVSTNT1_BF : MInst<"svstnt1[_{d}]", "vPpd", "b", [IsStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_stnt1">; + def SVSTNT1_VNUM_BF : MInst<"svstnt1_vnum[_{d}]", "vPpld", "b", [IsStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_stnt1">; } //////////////////////////////////////////////////////////////////////////////// // Prefetches // Prefetch (Scalar base) -def SVPRFB : MInst<"svprfb", "vPQJ", "c", [IsPrefetch], MemEltTyInt8, "aarch64_sve_prf">; -def SVPRFH : MInst<"svprfh", "vPQJ", "s", [IsPrefetch], MemEltTyInt16, "aarch64_sve_prf">; -def SVPRFW : MInst<"svprfw", "vPQJ", "i", [IsPrefetch], MemEltTyInt32, "aarch64_sve_prf">; -def SVPRFD : MInst<"svprfd", "vPQJ", "l", [IsPrefetch], MemEltTyInt64, "aarch64_sve_prf">; +def SVPRFB : MInst<"svprfb", "vPQJ", "c", [IsPrefetch, IsStreamingCompatible], MemEltTyInt8, "aarch64_sve_prf">; +def SVPRFH : MInst<"svprfh", "vPQJ", "s", [IsPrefetch, IsStreamingCompatible], MemEltTyInt16, "aarch64_sve_prf">; +def SVPRFW : MInst<"svprfw", "vPQJ", "i", [IsPrefetch, IsStreamingCompatible], MemEltTyInt32, "aarch64_sve_prf">; +def SVPRFD : MInst<"svprfd", "vPQJ", "l", [IsPrefetch, IsStreamingCompatible], MemEltTyInt64, "aarch64_sve_prf">; // Prefetch (Scalar base, VL displacement) -def SVPRFB_VNUM : MInst<"svprfb_vnum", "vPQlJ", "c", [IsPrefetch], MemEltTyInt8, "aarch64_sve_prf">; -def SVPRFH_VNUM : MInst<"svprfh_vnum", "vPQlJ", "s", [IsPrefetch], MemEltTyInt16, "aarch64_sve_prf">; -def SVPRFW_VNUM : 
MInst<"svprfw_vnum", "vPQlJ", "i", [IsPrefetch], MemEltTyInt32, "aarch64_sve_prf">; -def SVPRFD_VNUM : MInst<"svprfd_vnum", "vPQlJ", "l", [IsPrefetch], MemEltTyInt64, "aarch64_sve_prf">; +def SVPRFB_VNUM : MInst<"svprfb_vnum", "vPQlJ", "c", [IsPrefetch, IsStreamingCompatible], MemEltTyInt8, "aarch64_sve_prf">; +def SVPRFH_VNUM : MInst<"svprfh_vnum", "vPQlJ", "s", [IsPrefetch, IsStreamingCompatible], MemEltTyInt16, "aarch64_sve_prf">; +def SVPRFW_VNUM : MInst<"svprfw_vnum", "vPQlJ", "i", [IsPrefetch, IsStreamingCompatible], MemEltTyInt32, "aarch64_sve_prf">; +def SVPRFD_VNUM : MInst<"svprfd_vnum", "vPQlJ", "l", [IsPrefetch, IsStreamingCompatible], MemEltTyInt64, "aarch64_sve_prf">; // Prefetch (Vector bases) def SVPRFB_GATHER_BASES : MInst<"svprfb_gather[_{2}base]", "vPdJ", "UiUl", [IsGatherPrefetch], MemEltTyInt8, "aarch64_sve_prfb_gather_scalar_offset">; @@ -725,16 +505,16 @@ //////////////////////////////////////////////////////////////////////////////// // Scalar to vector -def SVDUPQ_8 : SInst<"svdupq[_n]_{d}", "dssssssssssssssss", "cUc", MergeNone>; -def SVDUPQ_16 : SInst<"svdupq[_n]_{d}", "dssssssss", "sUsh", MergeNone>; +def SVDUPQ_8 : SInst<"svdupq[_n]_{d}", "dssssssssssssssss", "cUc", MergeNone, "", [IsStreamingCompatible]>; +def SVDUPQ_16 : SInst<"svdupq[_n]_{d}", "dssssssss", "sUsh", MergeNone, "", [IsStreamingCompatible]>; let TargetGuard = "sve,bf16" in { - def SVDUPQ_BF16 : SInst<"svdupq[_n]_{d}", "dssssssss", "b", MergeNone>; + def SVDUPQ_BF16 : SInst<"svdupq[_n]_{d}", "dssssssss", "b", MergeNone, "", [IsStreamingCompatible]>; } -def SVDUPQ_32 : SInst<"svdupq[_n]_{d}", "dssss", "iUif", MergeNone>; -def SVDUPQ_64 : SInst<"svdupq[_n]_{d}", "dss", "lUld", MergeNone>; +def SVDUPQ_32 : SInst<"svdupq[_n]_{d}", "dssss", "iUif", MergeNone, "", [IsStreamingCompatible]>; +def SVDUPQ_64 : SInst<"svdupq[_n]_{d}", "dss", "lUld", MergeNone, "", [IsStreamingCompatible]>; multiclass svdup_base { - def NAME : SInst; + def NAME : SInst; let TargetGuard = "sve,bf16" in { def _BF16: SInst; } @@ -745,14 +525,14 @@ defm SVDUP_X : svdup_base<"svdup[_n]_{d}", "dPs", MergeAnyExp, "aarch64_sve_dup">; defm SVDUP_Z : svdup_base<"svdup[_n]_{d}", "dPs", MergeZeroExp, "aarch64_sve_dup">; -def SVINDEX : SInst<"svindex_{d}", "dss", "csilUcUsUiUl", MergeNone, "aarch64_sve_index">; +def SVINDEX : SInst<"svindex_{d}", "dss", "csilUcUsUiUl", MergeNone, "aarch64_sve_index", [IsStreamingCompatible]>; // Integer arithmetic -multiclass SInstZPZ flags=[]> { - def _M : SInst; - def _X : SInst; - def _Z : SInst; +multiclass SInstZPZ { + def _M : SInst; + def _X : SInst; + def _Z : SInst; } defm SVABS : SInstZPZ<"svabs", "csil", "aarch64_sve_abs">; @@ -761,13 +541,13 @@ //------------------------------------------------------------------------------ multiclass SInstZPZZ flags=[]> { - def _M : SInst; - def _X : SInst; - def _Z : SInst; + def _M : SInst; + def _X : SInst; + def _Z : SInst; - def _N_M : SInst; - def _N_X : SInst; - def _N_Z : SInst; + def _N_M : SInst; + def _N_X : SInst; + def _N_Z : SInst; } defm SVABD_S : SInstZPZZ<"svabd", "csil", "aarch64_sve_sabd", "aarch64_sve_sabd_u">; @@ -787,6 +567,12 @@ defm SVSUB : SInstZPZZ<"svsub", "csilUcUsUiUl", "aarch64_sve_sub", "aarch64_sve_sub_u">; defm SVSUBR : SInstZPZZ<"svsubr", "csilUcUsUiUl", "aarch64_sve_subr", "aarch64_sve_sub_u", [ReverseMergeAnyBinOp]>; +let TargetGuard = "sve2p1,b16b16" in { + defm SVMUL_BF : SInstZPZZ<"svmul", "b", "aarch64_sve_fmul", "aarch64_sve_fmul_u">; + defm SVADD_BF : SInstZPZZ<"svadd", "b", "aarch64_sve_fadd", "aarch64_sve_fadd_u">; 
+ defm SVSUB_BF : SInstZPZZ<"svsub", "b", "aarch64_sve_fsub", "aarch64_sve_fsub_u">; +} + //------------------------------------------------------------------------------ multiclass SInstZPZZZ flags=[]> { @@ -799,29 +585,29 @@ def _N_Z : SInst; } -defm SVMAD : SInstZPZZZ<"svmad", "csilUcUsUiUl", "aarch64_sve_mad", "aarch64_sve_mad">; -defm SVMLA : SInstZPZZZ<"svmla", "csilUcUsUiUl", "aarch64_sve_mla", "aarch64_sve_mla">; -defm SVMLS : SInstZPZZZ<"svmls", "csilUcUsUiUl", "aarch64_sve_mls", "aarch64_sve_mls">; -defm SVMSB : SInstZPZZZ<"svmsb", "csilUcUsUiUl", "aarch64_sve_msb", "aarch64_sve_msb">; +defm SVMAD : SInstZPZZZ<"svmad", "csilUcUsUiUl", "aarch64_sve_mad", "aarch64_sve_mad", [IsStreamingCompatible]>; +defm SVMLA : SInstZPZZZ<"svmla", "csilUcUsUiUl", "aarch64_sve_mla", "aarch64_sve_mla", [IsStreamingCompatible]>; +defm SVMLS : SInstZPZZZ<"svmls", "csilUcUsUiUl", "aarch64_sve_mls", "aarch64_sve_mls", [IsStreamingCompatible]>; +defm SVMSB : SInstZPZZZ<"svmsb", "csilUcUsUiUl", "aarch64_sve_msb", "aarch64_sve_msb", [IsStreamingCompatible]>; //------------------------------------------------------------------------------ -def SVDOT_S : SInst<"svdot[_{0}]", "ddqq", "il", MergeNone, "aarch64_sve_sdot">; -def SVDOT_U : SInst<"svdot[_{0}]", "ddqq", "UiUl", MergeNone, "aarch64_sve_udot">; -def SVQADD_S : SInst<"svqadd[_{d}]", "ddd", "csil", MergeNone, "aarch64_sve_sqadd_x">; -def SVQADD_U : SInst<"svqadd[_{d}]", "ddd", "UcUsUiUl", MergeNone, "aarch64_sve_uqadd_x">; -def SVQSUB_S : SInst<"svqsub[_{d}]", "ddd", "csil", MergeNone, "aarch64_sve_sqsub_x">; -def SVQSUB_U : SInst<"svqsub[_{d}]", "ddd", "UcUsUiUl", MergeNone, "aarch64_sve_uqsub_x">; +def SVDOT_S : SInst<"svdot[_{0}]", "ddqq", "il", MergeNone, "aarch64_sve_sdot", [IsStreamingCompatible]>; +def SVDOT_U : SInst<"svdot[_{0}]", "ddqq", "UiUl", MergeNone, "aarch64_sve_udot", [IsStreamingCompatible]>; +def SVQADD_S : SInst<"svqadd[_{d}]", "ddd", "csil", MergeNone, "aarch64_sve_sqadd_x", [IsStreamingCompatible]>; +def SVQADD_U : SInst<"svqadd[_{d}]", "ddd", "UcUsUiUl", MergeNone, "aarch64_sve_uqadd_x", [IsStreamingCompatible]>; +def SVQSUB_S : SInst<"svqsub[_{d}]", "ddd", "csil", MergeNone, "aarch64_sve_sqsub_x", [IsStreamingCompatible]>; +def SVQSUB_U : SInst<"svqsub[_{d}]", "ddd", "UcUsUiUl", MergeNone, "aarch64_sve_uqsub_x", [IsStreamingCompatible]>; -def SVDOT_N_S : SInst<"svdot[_n_{0}]", "ddqr", "il", MergeNone, "aarch64_sve_sdot">; -def SVDOT_N_U : SInst<"svdot[_n_{0}]", "ddqr", "UiUl", MergeNone, "aarch64_sve_udot">; -def SVQADD_N_S : SInst<"svqadd[_n_{d}]", "dda", "csil", MergeNone, "aarch64_sve_sqadd_x">; -def SVQADD_N_U : SInst<"svqadd[_n_{d}]", "dda", "UcUsUiUl", MergeNone, "aarch64_sve_uqadd_x">; -def SVQSUB_N_S : SInst<"svqsub[_n_{d}]", "dda", "csil", MergeNone, "aarch64_sve_sqsub_x">; -def SVQSUB_N_U : SInst<"svqsub[_n_{d}]", "dda", "UcUsUiUl", MergeNone, "aarch64_sve_uqsub_x">; +def SVDOT_N_S : SInst<"svdot[_n_{0}]", "ddqr", "il", MergeNone, "aarch64_sve_sdot", [IsStreamingCompatible]>; +def SVDOT_N_U : SInst<"svdot[_n_{0}]", "ddqr", "UiUl", MergeNone, "aarch64_sve_udot", [IsStreamingCompatible]>; +def SVQADD_N_S : SInst<"svqadd[_n_{d}]", "dda", "csil", MergeNone, "aarch64_sve_sqadd_x", [IsStreamingCompatible]>; +def SVQADD_N_U : SInst<"svqadd[_n_{d}]", "dda", "UcUsUiUl", MergeNone, "aarch64_sve_uqadd_x", [IsStreamingCompatible]>; +def SVQSUB_N_S : SInst<"svqsub[_n_{d}]", "dda", "csil", MergeNone, "aarch64_sve_sqsub_x", [IsStreamingCompatible]>; +def SVQSUB_N_U : SInst<"svqsub[_n_{d}]", "dda", "UcUsUiUl", MergeNone, 
"aarch64_sve_uqsub_x", [IsStreamingCompatible]>; -def SVDOT_LANE_S : SInst<"svdot_lane[_{d}]", "ddqqi", "il", MergeNone, "aarch64_sve_sdot_lane", [], [ImmCheck<3, ImmCheckLaneIndexDot, 2>]>; -def SVDOT_LANE_U : SInst<"svdot_lane[_{d}]", "ddqqi", "UiUl", MergeNone, "aarch64_sve_udot_lane", [], [ImmCheck<3, ImmCheckLaneIndexDot, 2>]>; +def SVDOT_LANE_S : SInst<"svdot_lane[_{d}]", "ddqqi", "il", MergeNone, "aarch64_sve_sdot_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndexDot, 2>]>; +def SVDOT_LANE_U : SInst<"svdot_lane[_{d}]", "ddqqi", "UiUl", MergeNone, "aarch64_sve_udot_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndexDot, 2>]>; //////////////////////////////////////////////////////////////////////////////// // Logical operations @@ -838,107 +624,107 @@ // Shifts multiclass SInst_SHIFT { - def _M : SInst; - def _X : SInst; - def _Z : SInst; + def _M : SInst; + def _X : SInst; + def _Z : SInst; - def _N_M : SInst; - def _N_X : SInst; - def _N_Z : SInst; + def _N_M : SInst; + def _N_X : SInst; + def _N_Z : SInst; - def _WIDE_M : SInst; - def _WIDE_X : SInst; - def _WIDE_Z : SInst; + def _WIDE_M : SInst; + def _WIDE_X : SInst; + def _WIDE_Z : SInst; - def _WIDE_N_M : SInst; - def _WIDE_N_X : SInst; - def _WIDE_N_Z : SInst; + def _WIDE_N_M : SInst; + def _WIDE_N_X : SInst; + def _WIDE_N_Z : SInst; } defm SVASR : SInst_SHIFT<"svasr", "aarch64_sve_asr", "csil", "csi">; defm SVLSL : SInst_SHIFT<"svlsl", "aarch64_sve_lsl", "csilUcUsUiUl", "csiUcUsUi">; defm SVLSR : SInst_SHIFT<"svlsr", "aarch64_sve_lsr", "UcUsUiUl", "UcUsUi">; -def SVASRD_M : SInst<"svasrd[_n_{d}]", "dPdi", "csil", MergeOp1, "aarch64_sve_asrd", [], [ImmCheck<2, ImmCheckShiftRight, 1>]>; -def SVASRD_X : SInst<"svasrd[_n_{d}]", "dPdi", "csil", MergeAny, "aarch64_sve_asrd", [], [ImmCheck<2, ImmCheckShiftRight, 1>]>; -def SVASRD_Z : SInst<"svasrd[_n_{d}]", "dPdi", "csil", MergeZero, "aarch64_sve_asrd", [], [ImmCheck<2, ImmCheckShiftRight, 1>]>; +def SVASRD_M : SInst<"svasrd[_n_{d}]", "dPdi", "csil", MergeOp1, "aarch64_sve_asrd", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRight, 1>]>; +def SVASRD_X : SInst<"svasrd[_n_{d}]", "dPdi", "csil", MergeAny, "aarch64_sve_asrd", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRight, 1>]>; +def SVASRD_Z : SInst<"svasrd[_n_{d}]", "dPdi", "csil", MergeZero, "aarch64_sve_asrd", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRight, 1>]>; -def SVINSR : SInst<"svinsr[_n_{d}]", "dds", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_insr">; +def SVINSR : SInst<"svinsr[_n_{d}]", "dds", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_insr", [IsStreamingCompatible]>; let TargetGuard = "sve,bf16" in { - def SVINSR_BF16 : SInst<"svinsr[_n_{d}]", "dds", "b", MergeNone, "aarch64_sve_insr">; + def SVINSR_BF16 : SInst<"svinsr[_n_{d}]", "dds", "b", MergeNone, "aarch64_sve_insr", [IsStreamingCompatible]>; } //////////////////////////////////////////////////////////////////////////////// // Integer reductions -def SVADDV_S : SInst<"svaddv[_{d}]", "lPd", "csil", MergeNone, "aarch64_sve_saddv">; -def SVADDV_U : SInst<"svaddv[_{d}]", "nPd", "UcUsUiUl", MergeNone, "aarch64_sve_uaddv">; -def SVANDV : SInst<"svandv[_{d}]", "sPd", "csilUcUsUiUl", MergeNone, "aarch64_sve_andv">; -def SVEORV : SInst<"sveorv[_{d}]", "sPd", "csilUcUsUiUl", MergeNone, "aarch64_sve_eorv">; -def SVMAXV_S : SInst<"svmaxv[_{d}]", "sPd", "csil", MergeNone, "aarch64_sve_smaxv">; -def SVMAXV_U : SInst<"svmaxv[_{d}]", "sPd", "UcUsUiUl", MergeNone, "aarch64_sve_umaxv">; -def SVMINV_S : SInst<"svminv[_{d}]", "sPd", "csil", 
MergeNone, "aarch64_sve_sminv">; -def SVMINV_U : SInst<"svminv[_{d}]", "sPd", "UcUsUiUl", MergeNone, "aarch64_sve_uminv">; -def SVORV : SInst<"svorv[_{d}]", "sPd", "csilUcUsUiUl", MergeNone, "aarch64_sve_orv">; +def SVADDV_S : SInst<"svaddv[_{d}]", "lPd", "csil", MergeNone, "aarch64_sve_saddv", [IsStreamingCompatible]>; +def SVADDV_U : SInst<"svaddv[_{d}]", "nPd", "UcUsUiUl", MergeNone, "aarch64_sve_uaddv", [IsStreamingCompatible]>; +def SVANDV : SInst<"svandv[_{d}]", "sPd", "csilUcUsUiUl", MergeNone, "aarch64_sve_andv", [IsStreamingCompatible]>; +def SVEORV : SInst<"sveorv[_{d}]", "sPd", "csilUcUsUiUl", MergeNone, "aarch64_sve_eorv", [IsStreamingCompatible]>; +def SVMAXV_S : SInst<"svmaxv[_{d}]", "sPd", "csil", MergeNone, "aarch64_sve_smaxv", [IsStreamingCompatible]>; +def SVMAXV_U : SInst<"svmaxv[_{d}]", "sPd", "UcUsUiUl", MergeNone, "aarch64_sve_umaxv", [IsStreamingCompatible]>; +def SVMINV_S : SInst<"svminv[_{d}]", "sPd", "csil", MergeNone, "aarch64_sve_sminv", [IsStreamingCompatible]>; +def SVMINV_U : SInst<"svminv[_{d}]", "sPd", "UcUsUiUl", MergeNone, "aarch64_sve_uminv", [IsStreamingCompatible]>; +def SVORV : SInst<"svorv[_{d}]", "sPd", "csilUcUsUiUl", MergeNone, "aarch64_sve_orv", [IsStreamingCompatible]>; //////////////////////////////////////////////////////////////////////////////// // Integer comparisons -def SVCMPEQ : SInst<"svcmpeq[_{d}]", "PPdd", "csilUcUsUiUl", MergeNone, "aarch64_sve_cmpeq">; -def SVCMPNE : SInst<"svcmpne[_{d}]", "PPdd", "csilUcUsUiUl", MergeNone, "aarch64_sve_cmpne">; -def SVCMPGE : SInst<"svcmpge[_{d}]", "PPdd", "csil", MergeNone, "aarch64_sve_cmpge">; -def SVCMPGT : SInst<"svcmpgt[_{d}]", "PPdd", "csil", MergeNone, "aarch64_sve_cmpgt">; -def SVCMPLE : SInst<"svcmple[_{d}]", "PPdd", "csil", MergeNone, "aarch64_sve_cmpge", [ReverseCompare]>; -def SVCMPLT : SInst<"svcmplt[_{d}]", "PPdd", "csil", MergeNone, "aarch64_sve_cmpgt", [ReverseCompare]>; -def SVCMPHI : SInst<"svcmpgt[_{d}]", "PPdd", "UcUsUiUl", MergeNone, "aarch64_sve_cmphi">; -def SVCMPHS : SInst<"svcmpge[_{d}]", "PPdd", "UcUsUiUl", MergeNone, "aarch64_sve_cmphs">; -def SVCMPLO : SInst<"svcmplt[_{d}]", "PPdd", "UcUsUiUl", MergeNone, "aarch64_sve_cmphi", [ReverseCompare]>; -def SVCMPLS : SInst<"svcmple[_{d}]", "PPdd", "UcUsUiUl", MergeNone, "aarch64_sve_cmphs", [ReverseCompare]>; - -def SVCMPEQ_N : SInst<"svcmpeq[_n_{d}]", "PPda", "csilUcUsUiUl", MergeNone, "aarch64_sve_cmpeq">; -def SVCMPNE_N : SInst<"svcmpne[_n_{d}]", "PPda", "csilUcUsUiUl", MergeNone, "aarch64_sve_cmpne">; -def SVCMPGE_N : SInst<"svcmpge[_n_{d}]", "PPda", "csil", MergeNone, "aarch64_sve_cmpge">; -def SVCMPGT_N : SInst<"svcmpgt[_n_{d}]", "PPda", "csil", MergeNone, "aarch64_sve_cmpgt">; -def SVCMPLE_N : SInst<"svcmple[_n_{d}]", "PPda", "csil", MergeNone, "aarch64_sve_cmpge", [ReverseCompare]>; -def SVCMPLT_N : SInst<"svcmplt[_n_{d}]", "PPda", "csil", MergeNone, "aarch64_sve_cmpgt", [ReverseCompare]>; -def SVCMPHS_N : SInst<"svcmpge[_n_{d}]", "PPda", "UcUsUiUl", MergeNone, "aarch64_sve_cmphs">; -def SVCMPHI_N : SInst<"svcmpgt[_n_{d}]", "PPda", "UcUsUiUl", MergeNone, "aarch64_sve_cmphi">; -def SVCMPLS_N : SInst<"svcmple[_n_{d}]", "PPda", "UcUsUiUl", MergeNone, "aarch64_sve_cmphs", [ReverseCompare]>; -def SVCMPLO_N : SInst<"svcmplt[_n_{d}]", "PPda", "UcUsUiUl", MergeNone, "aarch64_sve_cmphi", [ReverseCompare]>; - -def SVCMPEQ_WIDE : SInst<"svcmpeq_wide[_{d}]", "PPdw", "csi", MergeNone, "aarch64_sve_cmpeq_wide">; -def SVCMPNE_WIDE : SInst<"svcmpne_wide[_{d}]", "PPdw", "csi", MergeNone, "aarch64_sve_cmpne_wide">; -def SVCMPGE_WIDE : 
SInst<"svcmpge_wide[_{d}]", "PPdw", "csi", MergeNone, "aarch64_sve_cmpge_wide">; -def SVCMPGT_WIDE : SInst<"svcmpgt_wide[_{d}]", "PPdw", "csi", MergeNone, "aarch64_sve_cmpgt_wide">; -def SVCMPLE_WIDE : SInst<"svcmple_wide[_{d}]", "PPdw", "csi", MergeNone, "aarch64_sve_cmple_wide">; -def SVCMPLT_WIDE : SInst<"svcmplt_wide[_{d}]", "PPdw", "csi", MergeNone, "aarch64_sve_cmplt_wide">; -def SVCMPHI_WIDE : SInst<"svcmpgt_wide[_{d}]", "PPdw", "UcUsUi", MergeNone, "aarch64_sve_cmphi_wide">; -def SVCMPHS_WIDE : SInst<"svcmpge_wide[_{d}]", "PPdw", "UcUsUi", MergeNone, "aarch64_sve_cmphs_wide">; -def SVCMPLO_WIDE : SInst<"svcmplt_wide[_{d}]", "PPdw", "UcUsUi", MergeNone, "aarch64_sve_cmplo_wide">; -def SVCMPLS_WIDE : SInst<"svcmple_wide[_{d}]", "PPdw", "UcUsUi", MergeNone, "aarch64_sve_cmpls_wide">; - -def SVCMPEQ_WIDE_N : SInst<"svcmpeq_wide[_n_{d}]", "PPdj", "csi", MergeNone, "aarch64_sve_cmpeq_wide">; -def SVCMPNE_WIDE_N : SInst<"svcmpne_wide[_n_{d}]", "PPdj", "csi", MergeNone, "aarch64_sve_cmpne_wide">; -def SVCMPGE_WIDE_N : SInst<"svcmpge_wide[_n_{d}]", "PPdj", "csi", MergeNone, "aarch64_sve_cmpge_wide">; -def SVCMPGT_WIDE_N : SInst<"svcmpgt_wide[_n_{d}]", "PPdj", "csi", MergeNone, "aarch64_sve_cmpgt_wide">; -def SVCMPLE_WIDE_N : SInst<"svcmple_wide[_n_{d}]", "PPdj", "csi", MergeNone, "aarch64_sve_cmple_wide">; -def SVCMPLT_WIDE_N : SInst<"svcmplt_wide[_n_{d}]", "PPdj", "csi", MergeNone, "aarch64_sve_cmplt_wide">; -def SVCMPHS_WIDE_N : SInst<"svcmpge_wide[_n_{d}]", "PPdj", "UcUsUi", MergeNone, "aarch64_sve_cmphs_wide">; -def SVCMPHI_WIDE_N : SInst<"svcmpgt_wide[_n_{d}]", "PPdj", "UcUsUi", MergeNone, "aarch64_sve_cmphi_wide">; -def SVCMPLO_WIDE_N : SInst<"svcmplt_wide[_n_{d}]", "PPdj", "UcUsUi", MergeNone, "aarch64_sve_cmplo_wide">; -def SVCMPLS_WIDE_N : SInst<"svcmple_wide[_n_{d}]", "PPdj", "UcUsUi", MergeNone, "aarch64_sve_cmpls_wide">; +def SVCMPEQ : SInst<"svcmpeq[_{d}]", "PPdd", "csilUcUsUiUl", MergeNone, "aarch64_sve_cmpeq", [IsStreamingCompatible]>; +def SVCMPNE : SInst<"svcmpne[_{d}]", "PPdd", "csilUcUsUiUl", MergeNone, "aarch64_sve_cmpne", [IsStreamingCompatible]>; +def SVCMPGE : SInst<"svcmpge[_{d}]", "PPdd", "csil", MergeNone, "aarch64_sve_cmpge", [IsStreamingCompatible]>; +def SVCMPGT : SInst<"svcmpgt[_{d}]", "PPdd", "csil", MergeNone, "aarch64_sve_cmpgt", [IsStreamingCompatible]>; +def SVCMPLE : SInst<"svcmple[_{d}]", "PPdd", "csil", MergeNone, "aarch64_sve_cmpge", [ReverseCompare, IsStreamingCompatible]>; +def SVCMPLT : SInst<"svcmplt[_{d}]", "PPdd", "csil", MergeNone, "aarch64_sve_cmpgt", [ReverseCompare, IsStreamingCompatible]>; +def SVCMPHI : SInst<"svcmpgt[_{d}]", "PPdd", "UcUsUiUl", MergeNone, "aarch64_sve_cmphi", [IsStreamingCompatible]>; +def SVCMPHS : SInst<"svcmpge[_{d}]", "PPdd", "UcUsUiUl", MergeNone, "aarch64_sve_cmphs", [IsStreamingCompatible]>; +def SVCMPLO : SInst<"svcmplt[_{d}]", "PPdd", "UcUsUiUl", MergeNone, "aarch64_sve_cmphi", [ReverseCompare, IsStreamingCompatible]>; +def SVCMPLS : SInst<"svcmple[_{d}]", "PPdd", "UcUsUiUl", MergeNone, "aarch64_sve_cmphs", [ReverseCompare, IsStreamingCompatible]>; + +def SVCMPEQ_N : SInst<"svcmpeq[_n_{d}]", "PPda", "csilUcUsUiUl", MergeNone, "aarch64_sve_cmpeq", [IsStreamingCompatible]>; +def SVCMPNE_N : SInst<"svcmpne[_n_{d}]", "PPda", "csilUcUsUiUl", MergeNone, "aarch64_sve_cmpne", [IsStreamingCompatible]>; +def SVCMPGE_N : SInst<"svcmpge[_n_{d}]", "PPda", "csil", MergeNone, "aarch64_sve_cmpge", [IsStreamingCompatible]>; +def SVCMPGT_N : SInst<"svcmpgt[_n_{d}]", "PPda", "csil", MergeNone, "aarch64_sve_cmpgt", 
[IsStreamingCompatible]>; +def SVCMPLE_N : SInst<"svcmple[_n_{d}]", "PPda", "csil", MergeNone, "aarch64_sve_cmpge", [ReverseCompare, IsStreamingCompatible]>; +def SVCMPLT_N : SInst<"svcmplt[_n_{d}]", "PPda", "csil", MergeNone, "aarch64_sve_cmpgt", [ReverseCompare, IsStreamingCompatible]>; +def SVCMPHS_N : SInst<"svcmpge[_n_{d}]", "PPda", "UcUsUiUl", MergeNone, "aarch64_sve_cmphs", [IsStreamingCompatible]>; +def SVCMPHI_N : SInst<"svcmpgt[_n_{d}]", "PPda", "UcUsUiUl", MergeNone, "aarch64_sve_cmphi", [IsStreamingCompatible]>; +def SVCMPLS_N : SInst<"svcmple[_n_{d}]", "PPda", "UcUsUiUl", MergeNone, "aarch64_sve_cmphs", [ReverseCompare, IsStreamingCompatible]>; +def SVCMPLO_N : SInst<"svcmplt[_n_{d}]", "PPda", "UcUsUiUl", MergeNone, "aarch64_sve_cmphi", [ReverseCompare, IsStreamingCompatible]>; + +def SVCMPEQ_WIDE : SInst<"svcmpeq_wide[_{d}]", "PPdw", "csi", MergeNone, "aarch64_sve_cmpeq_wide", [IsStreamingCompatible]>; +def SVCMPNE_WIDE : SInst<"svcmpne_wide[_{d}]", "PPdw", "csi", MergeNone, "aarch64_sve_cmpne_wide", [IsStreamingCompatible]>; +def SVCMPGE_WIDE : SInst<"svcmpge_wide[_{d}]", "PPdw", "csi", MergeNone, "aarch64_sve_cmpge_wide", [IsStreamingCompatible]>; +def SVCMPGT_WIDE : SInst<"svcmpgt_wide[_{d}]", "PPdw", "csi", MergeNone, "aarch64_sve_cmpgt_wide", [IsStreamingCompatible]>; +def SVCMPLE_WIDE : SInst<"svcmple_wide[_{d}]", "PPdw", "csi", MergeNone, "aarch64_sve_cmple_wide", [IsStreamingCompatible]>; +def SVCMPLT_WIDE : SInst<"svcmplt_wide[_{d}]", "PPdw", "csi", MergeNone, "aarch64_sve_cmplt_wide", [IsStreamingCompatible]>; +def SVCMPHI_WIDE : SInst<"svcmpgt_wide[_{d}]", "PPdw", "UcUsUi", MergeNone, "aarch64_sve_cmphi_wide", [IsStreamingCompatible]>; +def SVCMPHS_WIDE : SInst<"svcmpge_wide[_{d}]", "PPdw", "UcUsUi", MergeNone, "aarch64_sve_cmphs_wide", [IsStreamingCompatible]>; +def SVCMPLO_WIDE : SInst<"svcmplt_wide[_{d}]", "PPdw", "UcUsUi", MergeNone, "aarch64_sve_cmplo_wide", [IsStreamingCompatible]>; +def SVCMPLS_WIDE : SInst<"svcmple_wide[_{d}]", "PPdw", "UcUsUi", MergeNone, "aarch64_sve_cmpls_wide", [IsStreamingCompatible]>; + +def SVCMPEQ_WIDE_N : SInst<"svcmpeq_wide[_n_{d}]", "PPdj", "csi", MergeNone, "aarch64_sve_cmpeq_wide", [IsStreamingCompatible]>; +def SVCMPNE_WIDE_N : SInst<"svcmpne_wide[_n_{d}]", "PPdj", "csi", MergeNone, "aarch64_sve_cmpne_wide", [IsStreamingCompatible]>; +def SVCMPGE_WIDE_N : SInst<"svcmpge_wide[_n_{d}]", "PPdj", "csi", MergeNone, "aarch64_sve_cmpge_wide", [IsStreamingCompatible]>; +def SVCMPGT_WIDE_N : SInst<"svcmpgt_wide[_n_{d}]", "PPdj", "csi", MergeNone, "aarch64_sve_cmpgt_wide", [IsStreamingCompatible]>; +def SVCMPLE_WIDE_N : SInst<"svcmple_wide[_n_{d}]", "PPdj", "csi", MergeNone, "aarch64_sve_cmple_wide", [IsStreamingCompatible]>; +def SVCMPLT_WIDE_N : SInst<"svcmplt_wide[_n_{d}]", "PPdj", "csi", MergeNone, "aarch64_sve_cmplt_wide", [IsStreamingCompatible]>; +def SVCMPHS_WIDE_N : SInst<"svcmpge_wide[_n_{d}]", "PPdj", "UcUsUi", MergeNone, "aarch64_sve_cmphs_wide", [IsStreamingCompatible]>; +def SVCMPHI_WIDE_N : SInst<"svcmpgt_wide[_n_{d}]", "PPdj", "UcUsUi", MergeNone, "aarch64_sve_cmphi_wide", [IsStreamingCompatible]>; +def SVCMPLO_WIDE_N : SInst<"svcmplt_wide[_n_{d}]", "PPdj", "UcUsUi", MergeNone, "aarch64_sve_cmplo_wide", [IsStreamingCompatible]>; +def SVCMPLS_WIDE_N : SInst<"svcmple_wide[_n_{d}]", "PPdj", "UcUsUi", MergeNone, "aarch64_sve_cmpls_wide", [IsStreamingCompatible]>; //////////////////////////////////////////////////////////////////////////////// // While comparisons -def SVWHILELE_S32 : SInst<"svwhilele_{d}[_{1}]", "Pkk", 
"PcPsPiPl", MergeNone, "aarch64_sve_whilele", [IsOverloadWhile]>; -def SVWHILELE_S64 : SInst<"svwhilele_{d}[_{1}]", "Pll", "PcPsPiPl", MergeNone, "aarch64_sve_whilele", [IsOverloadWhile]>; -def SVWHILELO_U32 : SInst<"svwhilelt_{d}[_{1}]", "Pmm", "PUcPUsPUiPUl", MergeNone, "aarch64_sve_whilelo", [IsOverloadWhile]>; -def SVWHILELO_U64 : SInst<"svwhilelt_{d}[_{1}]", "Pnn", "PUcPUsPUiPUl", MergeNone, "aarch64_sve_whilelo", [IsOverloadWhile]>; -def SVWHILELS_U32 : SInst<"svwhilele_{d}[_{1}]", "Pmm", "PUcPUsPUiPUl", MergeNone, "aarch64_sve_whilels", [IsOverloadWhile]>; -def SVWHILELS_U64 : SInst<"svwhilele_{d}[_{1}]", "Pnn", "PUcPUsPUiPUl", MergeNone, "aarch64_sve_whilels", [IsOverloadWhile]>; -def SVWHILELT_S32 : SInst<"svwhilelt_{d}[_{1}]", "Pkk", "PcPsPiPl", MergeNone, "aarch64_sve_whilelt", [IsOverloadWhile]>; -def SVWHILELT_S64 : SInst<"svwhilelt_{d}[_{1}]", "Pll", "PcPsPiPl", MergeNone, "aarch64_sve_whilelt", [IsOverloadWhile]>; +def SVWHILELE_S32 : SInst<"svwhilele_{d}[_{1}]", "Pkk", "PcPsPiPl", MergeNone, "aarch64_sve_whilele", [IsOverloadWhile, IsStreamingCompatible]>; +def SVWHILELE_S64 : SInst<"svwhilele_{d}[_{1}]", "Pll", "PcPsPiPl", MergeNone, "aarch64_sve_whilele", [IsOverloadWhile, IsStreamingCompatible]>; +def SVWHILELO_U32 : SInst<"svwhilelt_{d}[_{1}]", "Pmm", "PUcPUsPUiPUl", MergeNone, "aarch64_sve_whilelo", [IsOverloadWhile, IsStreamingCompatible]>; +def SVWHILELO_U64 : SInst<"svwhilelt_{d}[_{1}]", "Pnn", "PUcPUsPUiPUl", MergeNone, "aarch64_sve_whilelo", [IsOverloadWhile, IsStreamingCompatible]>; +def SVWHILELS_U32 : SInst<"svwhilele_{d}[_{1}]", "Pmm", "PUcPUsPUiPUl", MergeNone, "aarch64_sve_whilels", [IsOverloadWhile, IsStreamingCompatible]>; +def SVWHILELS_U64 : SInst<"svwhilele_{d}[_{1}]", "Pnn", "PUcPUsPUiPUl", MergeNone, "aarch64_sve_whilels", [IsOverloadWhile, IsStreamingCompatible]>; +def SVWHILELT_S32 : SInst<"svwhilelt_{d}[_{1}]", "Pkk", "PcPsPiPl", MergeNone, "aarch64_sve_whilelt", [IsOverloadWhile, IsStreamingCompatible]>; +def SVWHILELT_S64 : SInst<"svwhilelt_{d}[_{1}]", "Pll", "PcPsPiPl", MergeNone, "aarch64_sve_whilelt", [IsOverloadWhile, IsStreamingCompatible]>; //////////////////////////////////////////////////////////////////////////////// // Counting bit @@ -949,12 +735,12 @@ def _Z : SInst; } -defm SVCLS : SInstCLS<"svcls", "csil", "aarch64_sve_cls">; -defm SVCLZ : SInstCLS<"svclz", "csilUcUsUiUl", "aarch64_sve_clz">; -defm SVCNT : SInstCLS<"svcnt", "csilUcUsUiUlhfd", "aarch64_sve_cnt">; +defm SVCLS : SInstCLS<"svcls", "csil", "aarch64_sve_cls", [IsStreamingCompatible]>; +defm SVCLZ : SInstCLS<"svclz", "csilUcUsUiUl", "aarch64_sve_clz", [IsStreamingCompatible]>; +defm SVCNT : SInstCLS<"svcnt", "csilUcUsUiUlhfd", "aarch64_sve_cnt", [IsStreamingCompatible]>; let TargetGuard = "sve,bf16" in { - defm SVCNT_BF16 : SInstCLS<"svcnt", "b", "aarch64_sve_cnt">; + defm SVCNT_BF16 : SInstCLS<"svcnt", "b", "aarch64_sve_cnt", [IsStreamingCompatible]>; } //////////////////////////////////////////////////////////////////////////////// @@ -994,6 +780,13 @@ defm SVSUB_F : SInstZPZZ<"svsub", "hfd", "aarch64_sve_fsub", "aarch64_sve_fsub_u">; defm SVSUBR_F : SInstZPZZ<"svsubr", "hfd", "aarch64_sve_fsubr", "aarch64_sve_fsub_u", [ReverseMergeAnyBinOp]>; +let TargetGuard = "sve2p1,b16b16" in { + defm SVMAXNM_BF : SInstZPZZ<"svmaxnm","b", "aarch64_sve_fmaxnm", "aarch64_sve_fmaxnm_u">; + defm SVMINNM_BF : SInstZPZZ<"svminnm","b", "aarch64_sve_fminnm", "aarch64_sve_fminnm_u">; + defm SVMAX_BF : SInstZPZZ<"svmax", "b", "aarch64_sve_fmax", "aarch64_sve_fmax_u">; + defm SVMIN_BF : 
SInstZPZZ<"svmin", "b", "aarch64_sve_fmin", "aarch64_sve_fmin_u">; +} + defm SVRECPX : SInstZPZ<"svrecpx", "hfd", "aarch64_sve_frecpx">; defm SVRINTA : SInstZPZ<"svrinta", "hfd", "aarch64_sve_frinta">; defm SVRINTI : SInstZPZ<"svrinti", "hfd", "aarch64_sve_frinti">; @@ -1009,79 +802,83 @@ def SVTSMUL : SInst<"svtsmul[_{d}]", "ddu", "hfd", MergeNone, "aarch64_sve_ftsmul_x">; def SVTSSEL : SInst<"svtssel[_{d}]", "ddu", "hfd", MergeNone, "aarch64_sve_ftssel_x">; -def SVSCALE_M : SInst<"svscale[_{d}]", "dPdx", "hfd", MergeOp1, "aarch64_sve_fscale">; -def SVSCALE_X : SInst<"svscale[_{d}]", "dPdx", "hfd", MergeAny, "aarch64_sve_fscale">; -def SVSCALE_Z : SInst<"svscale[_{d}]", "dPdx", "hfd", MergeZero, "aarch64_sve_fscale">; - -def SVSCALE_N_M : SInst<"svscale[_n_{d}]", "dPdK", "hfd", MergeOp1, "aarch64_sve_fscale">; -def SVSCALE_N_X : SInst<"svscale[_n_{d}]", "dPdK", "hfd", MergeAny, "aarch64_sve_fscale">; -def SVSCALE_N_Z : SInst<"svscale[_n_{d}]", "dPdK", "hfd", MergeZero, "aarch64_sve_fscale">; - -defm SVMAD_F : SInstZPZZZ<"svmad", "hfd", "aarch64_sve_fmad", "aarch64_sve_fmla_u", [ReverseMergeAnyAccOp]>; -defm SVMLA_F : SInstZPZZZ<"svmla", "hfd", "aarch64_sve_fmla", "aarch64_sve_fmla_u">; -defm SVMLS_F : SInstZPZZZ<"svmls", "hfd", "aarch64_sve_fmls", "aarch64_sve_fmls_u">; -defm SVMSB_F : SInstZPZZZ<"svmsb", "hfd", "aarch64_sve_fmsb", "aarch64_sve_fmls_u", [ReverseMergeAnyAccOp]>; -defm SVNMAD_F : SInstZPZZZ<"svnmad", "hfd", "aarch64_sve_fnmad", "aarch64_sve_fnmla_u", [ReverseMergeAnyAccOp]>; -defm SVNMLA_F : SInstZPZZZ<"svnmla", "hfd", "aarch64_sve_fnmla", "aarch64_sve_fnmla_u">; -defm SVNMLS_F : SInstZPZZZ<"svnmls", "hfd", "aarch64_sve_fnmls", "aarch64_sve_fnmls_u">; -defm SVNMSB_F : SInstZPZZZ<"svnmsb", "hfd", "aarch64_sve_fnmsb", "aarch64_sve_fnmls_u", [ReverseMergeAnyAccOp]>; - -def SVCADD_M : SInst<"svcadd[_{d}]", "dPddi", "hfd", MergeOp1, "aarch64_sve_fcadd", [], [ImmCheck<3, ImmCheckComplexRot90_270>]>; -def SVCADD_X : SInst<"svcadd[_{d}]", "dPddi", "hfd", MergeAny, "aarch64_sve_fcadd", [], [ImmCheck<3, ImmCheckComplexRot90_270>]>; -def SVCADD_Z : SInst<"svcadd[_{d}]", "dPddi", "hfd", MergeZero, "aarch64_sve_fcadd", [], [ImmCheck<3, ImmCheckComplexRot90_270>]>; -def SVCMLA_M : SInst<"svcmla[_{d}]", "dPdddi", "hfd", MergeOp1, "aarch64_sve_fcmla", [], [ImmCheck<4, ImmCheckComplexRotAll90>]>; -def SVCMLA_X : SInst<"svcmla[_{d}]", "dPdddi", "hfd", MergeAny, "aarch64_sve_fcmla", [], [ImmCheck<4, ImmCheckComplexRotAll90>]>; -def SVCMLA_Z : SInst<"svcmla[_{d}]", "dPdddi", "hfd", MergeZero, "aarch64_sve_fcmla", [], [ImmCheck<4, ImmCheckComplexRotAll90>]>; - -def SVCMLA_LANE : SInst<"svcmla_lane[_{d}]", "ddddii", "hf", MergeNone, "aarch64_sve_fcmla_lane", [], [ImmCheck<3, ImmCheckLaneIndexCompRotate, 2>, +def SVSCALE_M : SInst<"svscale[_{d}]", "dPdx", "hfd", MergeOp1, "aarch64_sve_fscale", [IsStreamingCompatible]>; +def SVSCALE_X : SInst<"svscale[_{d}]", "dPdx", "hfd", MergeAny, "aarch64_sve_fscale", [IsStreamingCompatible]>; +def SVSCALE_Z : SInst<"svscale[_{d}]", "dPdx", "hfd", MergeZero, "aarch64_sve_fscale", [IsStreamingCompatible]>; + +def SVSCALE_N_M : SInst<"svscale[_n_{d}]", "dPdK", "hfd", MergeOp1, "aarch64_sve_fscale", [IsStreamingCompatible]>; +def SVSCALE_N_X : SInst<"svscale[_n_{d}]", "dPdK", "hfd", MergeAny, "aarch64_sve_fscale", [IsStreamingCompatible]>; +def SVSCALE_N_Z : SInst<"svscale[_n_{d}]", "dPdK", "hfd", MergeZero, "aarch64_sve_fscale", [IsStreamingCompatible]>; + +defm SVMAD_F : SInstZPZZZ<"svmad", "hfd", "aarch64_sve_fmad", "aarch64_sve_fmla_u", 
[IsStreamingCompatible, ReverseMergeAnyAccOp]>; +defm SVMLA_F : SInstZPZZZ<"svmla", "hfd", "aarch64_sve_fmla", "aarch64_sve_fmla_u", [IsStreamingCompatible]>; +defm SVMLS_F : SInstZPZZZ<"svmls", "hfd", "aarch64_sve_fmls", "aarch64_sve_fmls_u", [IsStreamingCompatible]>; +defm SVMSB_F : SInstZPZZZ<"svmsb", "hfd", "aarch64_sve_fmsb", "aarch64_sve_fmls_u", [IsStreamingCompatible, ReverseMergeAnyAccOp]>; +defm SVNMAD_F : SInstZPZZZ<"svnmad", "hfd", "aarch64_sve_fnmad", "aarch64_sve_fnmla_u", [IsStreamingCompatible, ReverseMergeAnyAccOp]>; +defm SVNMLA_F : SInstZPZZZ<"svnmla", "hfd", "aarch64_sve_fnmla", "aarch64_sve_fnmla_u", [IsStreamingCompatible]>; +defm SVNMLS_F : SInstZPZZZ<"svnmls", "hfd", "aarch64_sve_fnmls", "aarch64_sve_fnmls_u", [IsStreamingCompatible]>; +defm SVNMSB_F : SInstZPZZZ<"svnmsb", "hfd", "aarch64_sve_fnmsb", "aarch64_sve_fnmls_u", [IsStreamingCompatible, ReverseMergeAnyAccOp]>; + +def SVCADD_M : SInst<"svcadd[_{d}]", "dPddi", "hfd", MergeOp1, "aarch64_sve_fcadd", [IsStreamingCompatible], [ImmCheck<3, ImmCheckComplexRot90_270>]>; +def SVCADD_X : SInst<"svcadd[_{d}]", "dPddi", "hfd", MergeAny, "aarch64_sve_fcadd", [IsStreamingCompatible], [ImmCheck<3, ImmCheckComplexRot90_270>]>; +def SVCADD_Z : SInst<"svcadd[_{d}]", "dPddi", "hfd", MergeZero, "aarch64_sve_fcadd", [IsStreamingCompatible], [ImmCheck<3, ImmCheckComplexRot90_270>]>; +def SVCMLA_M : SInst<"svcmla[_{d}]", "dPdddi", "hfd", MergeOp1, "aarch64_sve_fcmla", [IsStreamingCompatible], [ImmCheck<4, ImmCheckComplexRotAll90>]>; +def SVCMLA_X : SInst<"svcmla[_{d}]", "dPdddi", "hfd", MergeAny, "aarch64_sve_fcmla", [IsStreamingCompatible], [ImmCheck<4, ImmCheckComplexRotAll90>]>; +def SVCMLA_Z : SInst<"svcmla[_{d}]", "dPdddi", "hfd", MergeZero, "aarch64_sve_fcmla", [IsStreamingCompatible], [ImmCheck<4, ImmCheckComplexRotAll90>]>; + +def SVCMLA_LANE : SInst<"svcmla_lane[_{d}]", "ddddii", "hf", MergeNone, "aarch64_sve_fcmla_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndexCompRotate, 2>, ImmCheck<4, ImmCheckComplexRotAll90>]>; -def SVMLA_LANE : SInst<"svmla_lane[_{d}]", "ddddi", "hfd", MergeNone, "aarch64_sve_fmla_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; -def SVMLS_LANE : SInst<"svmls_lane[_{d}]", "ddddi", "hfd", MergeNone, "aarch64_sve_fmls_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; -def SVMUL_LANE : SInst<"svmul_lane[_{d}]", "dddi", "hfd", MergeNone, "aarch64_sve_fmul_lane", [], [ImmCheck<2, ImmCheckLaneIndex, 1>]>; +def SVMLA_LANE : SInst<"svmla_lane[_{d}]", "ddddi", "hfd", MergeNone, "aarch64_sve_fmla_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; +def SVMLS_LANE : SInst<"svmls_lane[_{d}]", "ddddi", "hfd", MergeNone, "aarch64_sve_fmls_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; +def SVMUL_LANE : SInst<"svmul_lane[_{d}]", "dddi", "hfd", MergeNone, "aarch64_sve_fmul_lane", [IsStreamingCompatible], [ImmCheck<2, ImmCheckLaneIndex, 1>]>; + +let TargetGuard = "sve2p1,b16b16" in { + def SVMUL_LANE_BF : SInst<"svmul_lane[_{d}]", "dddi", "b", MergeNone, "aarch64_sve_fmul_lane", [IsStreamingCompatible], [ImmCheck<2, ImmCheckLaneIndex, 1>]>; +} -def SVRECPE : SInst<"svrecpe[_{d}]", "dd", "hfd", MergeNone, "aarch64_sve_frecpe_x">; -def SVRECPS : SInst<"svrecps[_{d}]", "ddd", "hfd", MergeNone, "aarch64_sve_frecps_x">; -def SVRSQRTE : SInst<"svrsqrte[_{d}]", "dd", "hfd", MergeNone, "aarch64_sve_frsqrte_x">; -def SVRSQRTS : SInst<"svrsqrts[_{d}]", "ddd", "hfd", MergeNone, "aarch64_sve_frsqrts_x">; +def SVRECPE : SInst<"svrecpe[_{d}]", "dd", "hfd", MergeNone, 
"aarch64_sve_frecpe_x", [IsStreamingCompatible]>; +def SVRECPS : SInst<"svrecps[_{d}]", "ddd", "hfd", MergeNone, "aarch64_sve_frecps_x", [IsStreamingCompatible]>; +def SVRSQRTE : SInst<"svrsqrte[_{d}]", "dd", "hfd", MergeNone, "aarch64_sve_frsqrte_x", [IsStreamingCompatible]>; +def SVRSQRTS : SInst<"svrsqrts[_{d}]", "ddd", "hfd", MergeNone, "aarch64_sve_frsqrts_x", [IsStreamingCompatible]>; //////////////////////////////////////////////////////////////////////////////// // Floating-point reductions def SVFADDA : SInst<"svadda[_{d}]", "sPsd", "hfd", MergeNone, "aarch64_sve_fadda">; -def SVFADDV : SInst<"svaddv[_{d}]", "sPd", "hfd", MergeNone, "aarch64_sve_faddv">; -def SVFMAXV : SInst<"svmaxv[_{d}]", "sPd", "hfd", MergeNone, "aarch64_sve_fmaxv">; -def SVFMAXNMV : SInst<"svmaxnmv[_{d}]", "sPd", "hfd", MergeNone, "aarch64_sve_fmaxnmv">; -def SVFMINV : SInst<"svminv[_{d}]", "sPd", "hfd", MergeNone, "aarch64_sve_fminv">; -def SVFMINNMV : SInst<"svminnmv[_{d}]", "sPd", "hfd", MergeNone, "aarch64_sve_fminnmv">; +def SVFADDV : SInst<"svaddv[_{d}]", "sPd", "hfd", MergeNone, "aarch64_sve_faddv", [IsStreamingCompatible]>; +def SVFMAXV : SInst<"svmaxv[_{d}]", "sPd", "hfd", MergeNone, "aarch64_sve_fmaxv", [IsStreamingCompatible]>; +def SVFMAXNMV : SInst<"svmaxnmv[_{d}]", "sPd", "hfd", MergeNone, "aarch64_sve_fmaxnmv", [IsStreamingCompatible]>; +def SVFMINV : SInst<"svminv[_{d}]", "sPd", "hfd", MergeNone, "aarch64_sve_fminv", [IsStreamingCompatible]>; +def SVFMINNMV : SInst<"svminnmv[_{d}]", "sPd", "hfd", MergeNone, "aarch64_sve_fminnmv", [IsStreamingCompatible]>; //////////////////////////////////////////////////////////////////////////////// // Floating-point comparisons -def SVACGE : SInst<"svacge[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_facge">; -def SVACGT : SInst<"svacgt[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_facgt">; -def SVACLE : SInst<"svacle[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_facge", [ReverseCompare]>; -def SVACLT : SInst<"svaclt[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_facgt", [ReverseCompare]>; -def SVCMPUO : SInst<"svcmpuo[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_fcmpuo">; - -def SVACGE_N : SInst<"svacge[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_facge">; -def SVACGT_N : SInst<"svacgt[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_facgt">; -def SVACLE_N : SInst<"svacle[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_facge", [ReverseCompare]>; -def SVACLT_N : SInst<"svaclt[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_facgt", [ReverseCompare]>; -def SVCMPUO_N : SInst<"svcmpuo[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_fcmpuo">; - -def SVCMPEQ_F : SInst<"svcmpeq[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_fcmpeq">; -def SVCMPNE_F : SInst<"svcmpne[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_fcmpne">; -def SVCMPGE_F : SInst<"svcmpge[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_fcmpge">; -def SVCMPGT_F : SInst<"svcmpgt[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_fcmpgt">; -def SVCMPLE_F : SInst<"svcmple[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_fcmpge", [ReverseCompare]>; -def SVCMPLT_F : SInst<"svcmplt[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_fcmpgt", [ReverseCompare]>; - -def SVCMPEQ_F_N : SInst<"svcmpeq[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_fcmpeq">; -def SVCMPNE_F_N : SInst<"svcmpne[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_fcmpne">; -def SVCMPGE_F_N : SInst<"svcmpge[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_fcmpge">; -def SVCMPGT_F_N : SInst<"svcmpgt[_n_{d}]", "PPda", "hfd", MergeNone, 
"aarch64_sve_fcmpgt">; -def SVCMPLE_F_N : SInst<"svcmple[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_fcmpge", [ReverseCompare]>; -def SVCMPLT_F_N : SInst<"svcmplt[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_fcmpgt", [ReverseCompare]>; +def SVACGE : SInst<"svacge[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_facge", [IsStreamingCompatible]>; +def SVACGT : SInst<"svacgt[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_facgt", [IsStreamingCompatible]>; +def SVACLE : SInst<"svacle[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_facge", [ReverseCompare, IsStreamingCompatible]>; +def SVACLT : SInst<"svaclt[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_facgt", [ReverseCompare, IsStreamingCompatible]>; +def SVCMPUO : SInst<"svcmpuo[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_fcmpuo", [IsStreamingCompatible]>; + +def SVACGE_N : SInst<"svacge[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_facge", [IsStreamingCompatible]>; +def SVACGT_N : SInst<"svacgt[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_facgt", [IsStreamingCompatible]>; +def SVACLE_N : SInst<"svacle[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_facge", [ReverseCompare, IsStreamingCompatible]>; +def SVACLT_N : SInst<"svaclt[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_facgt", [ReverseCompare, IsStreamingCompatible]>; +def SVCMPUO_N : SInst<"svcmpuo[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_fcmpuo", [IsStreamingCompatible]>; + +def SVCMPEQ_F : SInst<"svcmpeq[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_fcmpeq", [IsStreamingCompatible]>; +def SVCMPNE_F : SInst<"svcmpne[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_fcmpne", [IsStreamingCompatible]>; +def SVCMPGE_F : SInst<"svcmpge[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_fcmpge", [IsStreamingCompatible]>; +def SVCMPGT_F : SInst<"svcmpgt[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_fcmpgt", [IsStreamingCompatible]>; +def SVCMPLE_F : SInst<"svcmple[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_fcmpge", [ReverseCompare, IsStreamingCompatible]>; +def SVCMPLT_F : SInst<"svcmplt[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_fcmpgt", [ReverseCompare, IsStreamingCompatible]>; + +def SVCMPEQ_F_N : SInst<"svcmpeq[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_fcmpeq", [IsStreamingCompatible]>; +def SVCMPNE_F_N : SInst<"svcmpne[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_fcmpne", [IsStreamingCompatible]>; +def SVCMPGE_F_N : SInst<"svcmpge[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_fcmpge", [IsStreamingCompatible]>; +def SVCMPGT_F_N : SInst<"svcmpgt[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_fcmpgt", [IsStreamingCompatible]>; +def SVCMPLE_F_N : SInst<"svcmple[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_fcmpge", [ReverseCompare, IsStreamingCompatible]>; +def SVCMPLT_F_N : SInst<"svcmplt[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_fcmpgt", [ReverseCompare, IsStreamingCompatible]>; //////////////////////////////////////////////////////////////////////////////// // Floating-point conversions @@ -1089,16 +886,16 @@ multiclass SInstCvtMXZ< string name, string m_types, string xz_types, string types, string intrinsic, list flags = [IsOverloadNone]> { - def _M : SInst; - def _X : SInst; - def _Z : SInst; + def _M : SInst; + def _X : SInst; + def _Z : SInst; } multiclass SInstCvtMX flags = [IsOverloadNone]> { - def _M : SInst; - def _X : SInst; + def _M : SInst; + def _X : SInst; } // svcvt_s##_f16 @@ -1112,7 +909,7 @@ let TargetGuard = "sve,bf16" in { defm SVCVT_BF16_F32 : SInstCvtMXZ<"svcvt_bf16[_f32]", "ddPM", "dPM", "b", 
"aarch64_sve_fcvt_bf16f32">; - def SVCVTNT_BF16_F32 : SInst<"svcvtnt_bf16[_f32]", "ddPM", "b", MergeOp1, "aarch64_sve_fcvtnt_bf16f32", [IsOverloadNone]>; + def SVCVTNT_BF16_F32 : SInst<"svcvtnt_bf16[_f32]", "ddPM", "b", MergeOp1, "aarch64_sve_fcvtnt_bf16f32", [IsOverloadNone, IsStreamingCompatible]>; } // svcvt_s##_f64 @@ -1176,11 +973,11 @@ defm SVCVTX_F32 : SInstCvtMXZ<"svcvtx_f32[_f64]", "MMPd", "MPd", "d", "aarch64_sve_fcvtx_f32f64">; -def SVCVTNT_F32 : SInst<"svcvtnt_f16[_f32]", "hhPd", "f", MergeOp1, "aarch64_sve_fcvtnt_f16f32", [IsOverloadNone]>; -def SVCVTNT_F64 : SInst<"svcvtnt_f32[_f64]", "hhPd", "d", MergeOp1, "aarch64_sve_fcvtnt_f32f64", [IsOverloadNone]>; +def SVCVTNT_F32 : SInst<"svcvtnt_f16[_f32]", "hhPd", "f", MergeOp1, "aarch64_sve_fcvtnt_f16f32", [IsOverloadNone, IsStreamingCompatible]>; +def SVCVTNT_F64 : SInst<"svcvtnt_f32[_f64]", "hhPd", "d", MergeOp1, "aarch64_sve_fcvtnt_f32f64", [IsOverloadNone, IsStreamingCompatible]>; // SVCVTNT_X : Implemented as macro by SveEmitter.cpp -def SVCVTXNT_F32 : SInst<"svcvtxnt_f32[_f64]", "MMPd", "d", MergeOp1, "aarch64_sve_fcvtxnt_f32f64", [IsOverloadNone]>; +def SVCVTXNT_F32 : SInst<"svcvtxnt_f32[_f64]", "MMPd", "d", MergeOp1, "aarch64_sve_fcvtxnt_f32f64", [IsOverloadNone, IsStreamingCompatible]>; // SVCVTXNT_X_F32 : Implemented as macro by SveEmitter.cpp } @@ -1189,9 +986,9 @@ // Permutations and selection multiclass SVEPerm { - def : SInst; + def : SInst; let TargetGuard = "sve,bf16" in { - def: SInst; + def: SInst; } } @@ -1205,129 +1002,133 @@ // splat of any possible lane. It is upto LLVM to pick a more efficient // instruction such as DUP (indexed) if the lane index fits the range of the // instruction's immediate. -def SVDUP_LANE : SInst<"svdup_lane[_{d}]", "ddL", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_tbl">; +def SVDUP_LANE : SInst<"svdup_lane[_{d}]", "ddL", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_tbl", [IsStreamingCompatible]>; let TargetGuard = "sve,bf16" in { def SVDUP_LANE_BF16 : - SInst<"svdup_lane[_{d}]", "ddL", "b", MergeNone, "aarch64_sve_tbl">; + SInst<"svdup_lane[_{d}]", "ddL", "b", MergeNone, "aarch64_sve_tbl", [IsStreamingCompatible]>; } -def SVDUPQ_LANE : SInst<"svdupq_lane[_{d}]", "ddn", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_dupq_lane">; +def SVDUPQ_LANE : SInst<"svdupq_lane[_{d}]", "ddn", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_dupq_lane", [IsStreamingCompatible]>; let TargetGuard = "sve,bf16" in { - def SVDUPQ_LANE_BF16 : SInst<"svdupq_lane[_{d}]", "ddn", "b", MergeNone, "aarch64_sve_dupq_lane">; + def SVDUPQ_LANE_BF16 : SInst<"svdupq_lane[_{d}]", "ddn", "b", MergeNone, "aarch64_sve_dupq_lane", [IsStreamingCompatible]>; } -def SVEXT : SInst<"svext[_{d}]", "dddi", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_ext", [], [ImmCheck<2, ImmCheckExtract, 1>]>; +def SVEXT : SInst<"svext[_{d}]", "dddi", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_ext", [IsStreamingCompatible], [ImmCheck<2, ImmCheckExtract, 1>]>; defm SVLASTA : SVEPerm<"svlasta[_{d}]", "sPd", "aarch64_sve_lasta">; defm SVLASTB : SVEPerm<"svlastb[_{d}]", "sPd", "aarch64_sve_lastb">; -def SVREV : SInst<"svrev[_{d}]", "dd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_rev">; -def SVSEL : SInst<"svsel[_{d}]", "dPdd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_sel">; -def SVSPLICE : SInst<"svsplice[_{d}]", "dPdd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_splice">; -def SVTBL : SInst<"svtbl[_{d}]", "ddu", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_tbl">; +def SVREV : SInst<"svrev[_{d}]", "dd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_rev", 
[IsStreamingCompatible]>; +def SVSEL : SInst<"svsel[_{d}]", "dPdd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_sel", [IsStreamingCompatible]>; +def SVSPLICE : SInst<"svsplice[_{d}]", "dPdd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_splice", [IsStreamingCompatible]>; +def SVTBL : SInst<"svtbl[_{d}]", "ddu", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_tbl", [IsStreamingCompatible]>; let TargetGuard = "sve,bf16" in { - def SVTBL_BF16 : SInst<"svtbl[_{d}]", "ddu", "b", MergeNone, "aarch64_sve_tbl">; + def SVTBL_BF16 : SInst<"svtbl[_{d}]", "ddu", "b", MergeNone, "aarch64_sve_tbl", [IsStreamingCompatible]>; } -def SVTRN1 : SInst<"svtrn1[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_trn1">; -def SVTRN2 : SInst<"svtrn2[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_trn2">; -def SVUNPKHI_S : SInst<"svunpkhi[_{d}]", "dh", "sil", MergeNone, "aarch64_sve_sunpkhi">; -def SVUNPKHI_U : SInst<"svunpkhi[_{d}]", "dh", "UsUiUl", MergeNone, "aarch64_sve_uunpkhi">; -def SVUNPKLO_S : SInst<"svunpklo[_{d}]", "dh", "sil", MergeNone, "aarch64_sve_sunpklo">; -def SVUNPKLO_U : SInst<"svunpklo[_{d}]", "dh", "UsUiUl", MergeNone, "aarch64_sve_uunpklo">; -def SVUZP1 : SInst<"svuzp1[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_uzp1">; -def SVUZP2 : SInst<"svuzp2[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_uzp2">; -def SVZIP1 : SInst<"svzip1[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_zip1">; -def SVZIP2 : SInst<"svzip2[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_zip2">; +def SVTRN1 : SInst<"svtrn1[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_trn1", [IsStreamingCompatible]>; +def SVTRN2 : SInst<"svtrn2[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_trn2", [IsStreamingCompatible]>; +def SVUNPKHI_S : SInst<"svunpkhi[_{d}]", "dh", "sil", MergeNone, "aarch64_sve_sunpkhi", [IsStreamingCompatible]>; +def SVUNPKHI_U : SInst<"svunpkhi[_{d}]", "dh", "UsUiUl", MergeNone, "aarch64_sve_uunpkhi", [IsStreamingCompatible]>; +def SVUNPKLO_S : SInst<"svunpklo[_{d}]", "dh", "sil", MergeNone, "aarch64_sve_sunpklo", [IsStreamingCompatible]>; +def SVUNPKLO_U : SInst<"svunpklo[_{d}]", "dh", "UsUiUl", MergeNone, "aarch64_sve_uunpklo", [IsStreamingCompatible]>; +def SVUZP1 : SInst<"svuzp1[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_uzp1", [IsStreamingCompatible]>; +def SVUZP2 : SInst<"svuzp2[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_uzp2", [IsStreamingCompatible]>; +def SVZIP1 : SInst<"svzip1[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_zip1", [IsStreamingCompatible]>; +def SVZIP2 : SInst<"svzip2[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_zip2", [IsStreamingCompatible]>; let TargetGuard = "sve,bf16" in { -def SVEXT_BF16 : SInst<"svext[_{d}]", "dddi", "b", MergeNone, "aarch64_sve_ext", [], [ImmCheck<2, ImmCheckExtract, 1>]>; -def SVREV_BF16 : SInst<"svrev[_{d}]", "dd", "b", MergeNone, "aarch64_sve_rev">; -def SVSEL_BF16 : SInst<"svsel[_{d}]", "dPdd", "b", MergeNone, "aarch64_sve_sel">; -def SVSPLICE_BF16 : SInst<"svsplice[_{d}]", "dPdd", "b", MergeNone, "aarch64_sve_splice">; -def SVTRN1_BF16 : SInst<"svtrn1[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_trn1">; -def SVTRN2_BF16 : SInst<"svtrn2[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_trn2">; -def SVUZP1_BF16 : SInst<"svuzp1[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_uzp1">; -def SVUZP2_BF16 : SInst<"svuzp2[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_uzp2">; -def SVZIP1_BF16 : SInst<"svzip1[_{d}]", "ddd", "b", MergeNone, 
"aarch64_sve_zip1">; -def SVZIP2_BF16 : SInst<"svzip2[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_zip2">; -} - -def SVREV_B8 : SInst<"svrev_b8", "PP", "Pc", MergeNone, "aarch64_sve_rev">; -def SVREV_B16 : SInst<"svrev_b16", "PP", "Pc", MergeNone, "aarch64_sve_rev_b16", [IsOverloadNone]>; -def SVREV_B32 : SInst<"svrev_b32", "PP", "Pc", MergeNone, "aarch64_sve_rev_b32", [IsOverloadNone]>; -def SVREV_B64 : SInst<"svrev_b64", "PP", "Pc", MergeNone, "aarch64_sve_rev_b64", [IsOverloadNone]>; -def SVSEL_B : SInst<"svsel[_b]", "PPPP", "Pc", MergeNone, "aarch64_sve_sel">; -def SVTRN1_B8 : SInst<"svtrn1_b8", "PPP", "Pc", MergeNone, "aarch64_sve_trn1">; -def SVTRN1_B16 : SInst<"svtrn1_b16", "PPP", "Pc", MergeNone, "aarch64_sve_trn1_b16", [IsOverloadNone]>; -def SVTRN1_B32 : SInst<"svtrn1_b32", "PPP", "Pc", MergeNone, "aarch64_sve_trn1_b32", [IsOverloadNone]>; -def SVTRN1_B64 : SInst<"svtrn1_b64", "PPP", "Pc", MergeNone, "aarch64_sve_trn1_b64", [IsOverloadNone]>; -def SVTRN2_B8 : SInst<"svtrn2_b8", "PPP", "Pc", MergeNone, "aarch64_sve_trn2">; -def SVTRN2_B16 : SInst<"svtrn2_b16", "PPP", "Pc", MergeNone, "aarch64_sve_trn2_b16", [IsOverloadNone]>; -def SVTRN2_B32 : SInst<"svtrn2_b32", "PPP", "Pc", MergeNone, "aarch64_sve_trn2_b32", [IsOverloadNone]>; -def SVTRN2_B64 : SInst<"svtrn2_b64", "PPP", "Pc", MergeNone, "aarch64_sve_trn2_b64", [IsOverloadNone]>; -def SVPUNPKHI : SInst<"svunpkhi[_b]", "PP", "Pc", MergeNone, "aarch64_sve_punpkhi">; -def SVPUNPKLO : SInst<"svunpklo[_b]", "PP", "Pc", MergeNone, "aarch64_sve_punpklo">; -def SVUZP1_B8 : SInst<"svuzp1_b8", "PPP", "Pc", MergeNone, "aarch64_sve_uzp1">; -def SVUZP1_B16 : SInst<"svuzp1_b16", "PPP", "Pc", MergeNone, "aarch64_sve_uzp1_b16", [IsOverloadNone]>; -def SVUZP1_B32 : SInst<"svuzp1_b32", "PPP", "Pc", MergeNone, "aarch64_sve_uzp1_b32", [IsOverloadNone]>; -def SVUZP1_B64 : SInst<"svuzp1_b64", "PPP", "Pc", MergeNone, "aarch64_sve_uzp1_b64", [IsOverloadNone]>; -def SVUZP2_B8 : SInst<"svuzp2_b8", "PPP", "Pc", MergeNone, "aarch64_sve_uzp2">; -def SVUZP2_B16 : SInst<"svuzp2_b16", "PPP", "Pc", MergeNone, "aarch64_sve_uzp2_b16", [IsOverloadNone]>; -def SVUZP2_B32 : SInst<"svuzp2_b32", "PPP", "Pc", MergeNone, "aarch64_sve_uzp2_b32", [IsOverloadNone]>; -def SVUZP2_B64 : SInst<"svuzp2_b64", "PPP", "Pc", MergeNone, "aarch64_sve_uzp2_b64", [IsOverloadNone]>; -def SVZIP1_B8 : SInst<"svzip1_b8", "PPP", "Pc", MergeNone, "aarch64_sve_zip1">; -def SVZIP1_B16 : SInst<"svzip1_b16", "PPP", "Pc", MergeNone, "aarch64_sve_zip1_b16", [IsOverloadNone]>; -def SVZIP1_B32 : SInst<"svzip1_b32", "PPP", "Pc", MergeNone, "aarch64_sve_zip1_b32", [IsOverloadNone]>; -def SVZIP1_B64 : SInst<"svzip1_b64", "PPP", "Pc", MergeNone, "aarch64_sve_zip1_b64", [IsOverloadNone]>; -def SVZIP2_B : SInst<"svzip2_b8", "PPP", "Pc", MergeNone, "aarch64_sve_zip2">; -def SVZIP2_B16 : SInst<"svzip2_b16", "PPP", "Pc", MergeNone, "aarch64_sve_zip2_b16", [IsOverloadNone]>; -def SVZIP2_B32 : SInst<"svzip2_b32", "PPP", "Pc", MergeNone, "aarch64_sve_zip2_b32", [IsOverloadNone]>; -def SVZIP2_B64 : SInst<"svzip2_b64", "PPP", "Pc", MergeNone, "aarch64_sve_zip2_b64", [IsOverloadNone]>; +def SVEXT_BF16 : SInst<"svext[_{d}]", "dddi", "b", MergeNone, "aarch64_sve_ext", [IsStreamingCompatible], [ImmCheck<2, ImmCheckExtract, 1>]>; +def SVREV_BF16 : SInst<"svrev[_{d}]", "dd", "b", MergeNone, "aarch64_sve_rev", [IsStreamingCompatible]>; +def SVSEL_BF16 : SInst<"svsel[_{d}]", "dPdd", "b", MergeNone, "aarch64_sve_sel", [IsStreamingCompatible]>; +def SVSPLICE_BF16 : SInst<"svsplice[_{d}]", "dPdd", "b", MergeNone, 
"aarch64_sve_splice", [IsStreamingCompatible]>; +def SVTRN1_BF16 : SInst<"svtrn1[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_trn1", [IsStreamingCompatible]>; +def SVTRN2_BF16 : SInst<"svtrn2[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_trn2", [IsStreamingCompatible]>; +def SVUZP1_BF16 : SInst<"svuzp1[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_uzp1", [IsStreamingCompatible]>; +def SVUZP2_BF16 : SInst<"svuzp2[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_uzp2", [IsStreamingCompatible]>; +def SVZIP1_BF16 : SInst<"svzip1[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_zip1", [IsStreamingCompatible]>; +def SVZIP2_BF16 : SInst<"svzip2[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_zip2", [IsStreamingCompatible]>; +} + +def SVREV_B8 : SInst<"svrev_b8", "PP", "Pc", MergeNone, "aarch64_sve_rev", [IsStreamingCompatible]>; +def SVREV_B16 : SInst<"svrev_b16", "PP", "Pc", MergeNone, "aarch64_sve_rev_b16", [IsOverloadNone, IsStreamingCompatible]>; +def SVREV_B32 : SInst<"svrev_b32", "PP", "Pc", MergeNone, "aarch64_sve_rev_b32", [IsOverloadNone, IsStreamingCompatible]>; +def SVREV_B64 : SInst<"svrev_b64", "PP", "Pc", MergeNone, "aarch64_sve_rev_b64", [IsOverloadNone, IsStreamingCompatible]>; +def SVSEL_B : SInst<"svsel[_b]", "PPPP", "Pc", MergeNone, "aarch64_sve_sel", [IsStreamingCompatible]>; +def SVTRN1_B8 : SInst<"svtrn1_b8", "PPP", "Pc", MergeNone, "aarch64_sve_trn1", [IsStreamingCompatible]>; +def SVTRN1_B16 : SInst<"svtrn1_b16", "PPP", "Pc", MergeNone, "aarch64_sve_trn1_b16", [IsOverloadNone, IsStreamingCompatible]>; +def SVTRN1_B32 : SInst<"svtrn1_b32", "PPP", "Pc", MergeNone, "aarch64_sve_trn1_b32", [IsOverloadNone, IsStreamingCompatible]>; +def SVTRN1_B64 : SInst<"svtrn1_b64", "PPP", "Pc", MergeNone, "aarch64_sve_trn1_b64", [IsOverloadNone, IsStreamingCompatible]>; +def SVTRN2_B8 : SInst<"svtrn2_b8", "PPP", "Pc", MergeNone, "aarch64_sve_trn2", [IsStreamingCompatible]>; +def SVTRN2_B16 : SInst<"svtrn2_b16", "PPP", "Pc", MergeNone, "aarch64_sve_trn2_b16", [IsOverloadNone, IsStreamingCompatible]>; +def SVTRN2_B32 : SInst<"svtrn2_b32", "PPP", "Pc", MergeNone, "aarch64_sve_trn2_b32", [IsOverloadNone, IsStreamingCompatible]>; +def SVTRN2_B64 : SInst<"svtrn2_b64", "PPP", "Pc", MergeNone, "aarch64_sve_trn2_b64", [IsOverloadNone, IsStreamingCompatible]>; +def SVPUNPKHI : SInst<"svunpkhi[_b]", "PP", "Pc", MergeNone, "aarch64_sve_punpkhi", [IsStreamingCompatible]>; +def SVPUNPKLO : SInst<"svunpklo[_b]", "PP", "Pc", MergeNone, "aarch64_sve_punpklo", [IsStreamingCompatible]>; +def SVUZP1_B8 : SInst<"svuzp1_b8", "PPP", "Pc", MergeNone, "aarch64_sve_uzp1", [IsStreamingCompatible]>; +def SVUZP1_B16 : SInst<"svuzp1_b16", "PPP", "Pc", MergeNone, "aarch64_sve_uzp1_b16", [IsOverloadNone, IsStreamingCompatible]>; +def SVUZP1_B32 : SInst<"svuzp1_b32", "PPP", "Pc", MergeNone, "aarch64_sve_uzp1_b32", [IsOverloadNone, IsStreamingCompatible]>; +def SVUZP1_B64 : SInst<"svuzp1_b64", "PPP", "Pc", MergeNone, "aarch64_sve_uzp1_b64", [IsOverloadNone, IsStreamingCompatible]>; +def SVUZP2_B8 : SInst<"svuzp2_b8", "PPP", "Pc", MergeNone, "aarch64_sve_uzp2", [IsStreamingCompatible]>; +def SVUZP2_B16 : SInst<"svuzp2_b16", "PPP", "Pc", MergeNone, "aarch64_sve_uzp2_b16", [IsOverloadNone, IsStreamingCompatible]>; +def SVUZP2_B32 : SInst<"svuzp2_b32", "PPP", "Pc", MergeNone, "aarch64_sve_uzp2_b32", [IsOverloadNone, IsStreamingCompatible]>; +def SVUZP2_B64 : SInst<"svuzp2_b64", "PPP", "Pc", MergeNone, "aarch64_sve_uzp2_b64", [IsOverloadNone, IsStreamingCompatible]>; +def SVZIP1_B8 : SInst<"svzip1_b8", "PPP", "Pc", MergeNone, "aarch64_sve_zip1", 
[IsStreamingCompatible]>; +def SVZIP1_B16 : SInst<"svzip1_b16", "PPP", "Pc", MergeNone, "aarch64_sve_zip1_b16", [IsOverloadNone, IsStreamingCompatible]>; +def SVZIP1_B32 : SInst<"svzip1_b32", "PPP", "Pc", MergeNone, "aarch64_sve_zip1_b32", [IsOverloadNone, IsStreamingCompatible]>; +def SVZIP1_B64 : SInst<"svzip1_b64", "PPP", "Pc", MergeNone, "aarch64_sve_zip1_b64", [IsOverloadNone, IsStreamingCompatible]>; +def SVZIP2_B : SInst<"svzip2_b8", "PPP", "Pc", MergeNone, "aarch64_sve_zip2", [IsStreamingCompatible]>; +def SVZIP2_B16 : SInst<"svzip2_b16", "PPP", "Pc", MergeNone, "aarch64_sve_zip2_b16", [IsOverloadNone, IsStreamingCompatible]>; +def SVZIP2_B32 : SInst<"svzip2_b32", "PPP", "Pc", MergeNone, "aarch64_sve_zip2_b32", [IsOverloadNone, IsStreamingCompatible]>; +def SVZIP2_B64 : SInst<"svzip2_b64", "PPP", "Pc", MergeNone, "aarch64_sve_zip2_b64", [IsOverloadNone, IsStreamingCompatible]>; //////////////////////////////////////////////////////////////////////////////// // Predicate creation -def SVPFALSE : SInst<"svpfalse[_b]", "Pv", "", MergeNone, "", [IsOverloadNone]>; +def SVPFALSE : SInst<"svpfalse[_b]", "Pv", "", MergeNone, "", [IsOverloadNone, IsStreamingCompatible]>; + +let TargetGuard = "sme2" in { +def SVPFALSE_COUNT_ALIAS : SInst<"svpfalse_c", "}v", "", MergeNone, "", [IsOverloadNone, IsStreamingCompatible]>; +} -def SVPTRUE_PAT : SInst<"svptrue_pat_{d}", "PI", "PcPsPiPl", MergeNone, "aarch64_sve_ptrue">; -def SVPTRUE : SInst<"svptrue_{d}", "Pv", "PcPsPiPl", MergeNone, "aarch64_sve_ptrue", [IsAppendSVALL]>; +def SVPTRUE_PAT : SInst<"svptrue_pat_{d}", "PI", "PcPsPiPl", MergeNone, "aarch64_sve_ptrue", [IsStreamingCompatible]>; +def SVPTRUE : SInst<"svptrue_{d}", "Pv", "PcPsPiPl", MergeNone, "aarch64_sve_ptrue", [IsStreamingCompatible, IsAppendSVALL]>; -def SVDUPQ_B8 : SInst<"svdupq[_n]_{d}", "Pssssssssssssssss", "Pc", MergeNone>; -def SVDUPQ_B16 : SInst<"svdupq[_n]_{d}", "Pssssssss", "Ps", MergeNone>; -def SVDUPQ_B32 : SInst<"svdupq[_n]_{d}", "Pssss", "Pi", MergeNone>; -def SVDUPQ_B64 : SInst<"svdupq[_n]_{d}", "Pss", "Pl", MergeNone>; -def SVDUP_N_B : SInst<"svdup[_n]_{d}", "Ps", "PcPsPiPl", MergeNone>; +def SVDUPQ_B8 : SInst<"svdupq[_n]_{d}", "Pssssssssssssssss", "Pc", MergeNone, "", [IsStreamingCompatible]>; +def SVDUPQ_B16 : SInst<"svdupq[_n]_{d}", "Pssssssss", "Ps", MergeNone, "", [IsStreamingCompatible]>; +def SVDUPQ_B32 : SInst<"svdupq[_n]_{d}", "Pssss", "Pi", MergeNone, "", [IsStreamingCompatible]>; +def SVDUPQ_B64 : SInst<"svdupq[_n]_{d}", "Pss", "Pl", MergeNone, "", [IsStreamingCompatible]>; +def SVDUP_N_B : SInst<"svdup[_n]_{d}", "Ps", "PcPsPiPl", MergeNone, "", [IsStreamingCompatible]>; //////////////////////////////////////////////////////////////////////////////// // Predicate operations -def SVAND_B_Z : SInst<"svand[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_and_z">; -def SVBIC_B_Z : SInst<"svbic[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_bic_z">; -def SVEOR_B_Z : SInst<"sveor[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_eor_z">; -def SVMOV_B_Z : SInst<"svmov[_b]_z", "PPP", "Pc", MergeNone>; // Uses custom expansion -def SVNAND_B_Z : SInst<"svnand[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_nand_z">; -def SVNOR_B_Z : SInst<"svnor[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_nor_z">; -def SVNOT_B_Z : SInst<"svnot[_b]_z", "PPP", "Pc", MergeNone>; // Uses custom expansion -def SVORN_B_Z : SInst<"svorn[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_orn_z">; -def SVORR_B_Z : SInst<"svorr[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_orr_z">; - -def SVBRKA : 
SInst<"svbrka[_b]_m", "PPPP", "Pc", MergeNone, "aarch64_sve_brka">; -def SVBRKA_Z : SInst<"svbrka[_b]_z", "PPP", "Pc", MergeNone, "aarch64_sve_brka_z">; -def SVBRKB : SInst<"svbrkb[_b]_m", "PPPP", "Pc", MergeNone, "aarch64_sve_brkb">; -def SVBRKB_Z : SInst<"svbrkb[_b]_z", "PPP", "Pc", MergeNone, "aarch64_sve_brkb_z">; -def SVBRKN_Z : SInst<"svbrkn[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_brkn_z">; -def SVBRKPA_Z : SInst<"svbrkpa[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_brkpa_z">; -def SVBRKPB_Z : SInst<"svbrkpb[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_brkpb_z">; - -def SVPFIRST : SInst<"svpfirst[_b]", "PPP", "Pc", MergeNone, "aarch64_sve_pfirst">; -def SVPNEXT : SInst<"svpnext_{d}", "PPP", "PcPsPiPl", MergeNone, "aarch64_sve_pnext">; +def SVAND_B_Z : SInst<"svand[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_and_z", [IsStreamingCompatible]>; +def SVBIC_B_Z : SInst<"svbic[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_bic_z", [IsStreamingCompatible]>; +def SVEOR_B_Z : SInst<"sveor[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_eor_z", [IsStreamingCompatible]>; +def SVMOV_B_Z : SInst<"svmov[_b]_z", "PPP", "Pc", MergeNone, "", [IsStreamingCompatible]>; // Uses custom expansion +def SVNAND_B_Z : SInst<"svnand[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_nand_z", [IsStreamingCompatible]>; +def SVNOR_B_Z : SInst<"svnor[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_nor_z", [IsStreamingCompatible]>; +def SVNOT_B_Z : SInst<"svnot[_b]_z", "PPP", "Pc", MergeNone, "", [IsStreamingCompatible]>; // Uses custom expansion +def SVORN_B_Z : SInst<"svorn[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_orn_z", [IsStreamingCompatible]>; +def SVORR_B_Z : SInst<"svorr[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_orr_z", [IsStreamingCompatible]>; + +def SVBRKA : SInst<"svbrka[_b]_m", "PPPP", "Pc", MergeNone, "aarch64_sve_brka", [IsStreamingCompatible]>; +def SVBRKA_Z : SInst<"svbrka[_b]_z", "PPP", "Pc", MergeNone, "aarch64_sve_brka_z", [IsStreamingCompatible]>; +def SVBRKB : SInst<"svbrkb[_b]_m", "PPPP", "Pc", MergeNone, "aarch64_sve_brkb", [IsStreamingCompatible]>; +def SVBRKB_Z : SInst<"svbrkb[_b]_z", "PPP", "Pc", MergeNone, "aarch64_sve_brkb_z", [IsStreamingCompatible]>; +def SVBRKN_Z : SInst<"svbrkn[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_brkn_z", [IsStreamingCompatible]>; +def SVBRKPA_Z : SInst<"svbrkpa[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_brkpa_z", [IsStreamingCompatible]>; +def SVBRKPB_Z : SInst<"svbrkpb[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_brkpb_z", [IsStreamingCompatible]>; + +def SVPFIRST : SInst<"svpfirst[_b]", "PPP", "Pc", MergeNone, "aarch64_sve_pfirst", [IsStreamingCompatible]>; +def SVPNEXT : SInst<"svpnext_{d}", "PPP", "PcPsPiPl", MergeNone, "aarch64_sve_pnext", [IsStreamingCompatible]>; //////////////////////////////////////////////////////////////////////////////// // Testing predicates -def SVPTEST_ANY : SInst<"svptest_any", "sPP", "Pc", MergeNone, "aarch64_sve_ptest_any">; -def SVPTEST_FIRST : SInst<"svptest_first", "sPP", "Pc", MergeNone, "aarch64_sve_ptest_first">; -def SVPTEST_LAST : SInst<"svptest_last", "sPP", "Pc", MergeNone, "aarch64_sve_ptest_last">; +def SVPTEST_ANY : SInst<"svptest_any", "sPP", "Pc", MergeNone, "aarch64_sve_ptest_any", [IsStreamingCompatible]>; +def SVPTEST_FIRST : SInst<"svptest_first", "sPP", "Pc", MergeNone, "aarch64_sve_ptest_first", [IsStreamingCompatible]>; +def SVPTEST_LAST : SInst<"svptest_last", "sPP", "Pc", MergeNone, "aarch64_sve_ptest_last", [IsStreamingCompatible]>; 
//////////////////////////////////////////////////////////////////////////////// // FFR manipulation @@ -1340,21 +1141,21 @@ //////////////////////////////////////////////////////////////////////////////// // Counting elements -def SVCNTB_PAT : SInst<"svcntb_pat", "nI", "", MergeNone, "aarch64_sve_cntb", [IsOverloadNone]>; -def SVCNTH_PAT : SInst<"svcnth_pat", "nI", "", MergeNone, "aarch64_sve_cnth", [IsOverloadNone]>; -def SVCNTW_PAT : SInst<"svcntw_pat", "nI", "", MergeNone, "aarch64_sve_cntw", [IsOverloadNone]>; -def SVCNTD_PAT : SInst<"svcntd_pat", "nI", "", MergeNone, "aarch64_sve_cntd", [IsOverloadNone]>; +def SVCNTB_PAT : SInst<"svcntb_pat", "nI", "", MergeNone, "aarch64_sve_cntb", [IsOverloadNone, IsStreamingCompatible]>; +def SVCNTH_PAT : SInst<"svcnth_pat", "nI", "", MergeNone, "aarch64_sve_cnth", [IsOverloadNone, IsStreamingCompatible]>; +def SVCNTW_PAT : SInst<"svcntw_pat", "nI", "", MergeNone, "aarch64_sve_cntw", [IsOverloadNone, IsStreamingCompatible]>; +def SVCNTD_PAT : SInst<"svcntd_pat", "nI", "", MergeNone, "aarch64_sve_cntd", [IsOverloadNone, IsStreamingCompatible]>; -def SVCNTB : SInst<"svcntb", "nv", "", MergeNone, "aarch64_sve_cntb", [IsAppendSVALL, IsOverloadNone]>; -def SVCNTH : SInst<"svcnth", "nv", "", MergeNone, "aarch64_sve_cnth", [IsAppendSVALL, IsOverloadNone]>; -def SVCNTW : SInst<"svcntw", "nv", "", MergeNone, "aarch64_sve_cntw", [IsAppendSVALL, IsOverloadNone]>; -def SVCNTD : SInst<"svcntd", "nv", "", MergeNone, "aarch64_sve_cntd", [IsAppendSVALL, IsOverloadNone]>; +def SVCNTB : SInst<"svcntb", "nv", "", MergeNone, "aarch64_sve_cntb", [IsAppendSVALL, IsOverloadNone, IsStreamingCompatible]>; +def SVCNTH : SInst<"svcnth", "nv", "", MergeNone, "aarch64_sve_cnth", [IsAppendSVALL, IsOverloadNone, IsStreamingCompatible]>; +def SVCNTW : SInst<"svcntw", "nv", "", MergeNone, "aarch64_sve_cntw", [IsAppendSVALL, IsOverloadNone, IsStreamingCompatible]>; +def SVCNTD : SInst<"svcntd", "nv", "", MergeNone, "aarch64_sve_cntd", [IsAppendSVALL, IsOverloadNone, IsStreamingCompatible]>; -def SVCNTP : SInst<"svcntp_{d}", "nPP", "PcPsPiPl", MergeNone, "aarch64_sve_cntp">; -def SVLEN : SInst<"svlen[_{d}]", "nd", "csilUcUsUiUlhfd", MergeNone>; +def SVCNTP : SInst<"svcntp_{d}", "nPP", "PcPsPiPl", MergeNone, "aarch64_sve_cntp", [IsStreamingCompatible]>; +def SVLEN : SInst<"svlen[_{d}]", "nd", "csilUcUsUiUlhfd", MergeNone, "", [IsStreamingCompatible]>; let TargetGuard = "sve,bf16" in { -def SVLEN_BF16 : SInst<"svlen[_{d}]", "nd", "b", MergeNone>; +def SVLEN_BF16 : SInst<"svlen[_{d}]", "nd", "b", MergeNone, "", [IsStreamingCompatible]>; } //////////////////////////////////////////////////////////////////////////////// @@ -1371,20 +1172,20 @@ def UnsignedDoubleWord : sat_type<"U", "Ul">; multiclass SInst_SAT1 { - def _N32 : SInst]>; - def _N64 : SInst]>; - def _N32_ALL : SInst]>; - def _N64_ALL : SInst]>; + def _N32 : SInst]>; + def _N64 : SInst]>; + def _N32_ALL : SInst]>; + def _N64_ALL : SInst]>; } multiclass SInst_SAT2 { - def "" : SInst]>; - def _ALL : SInst]>; + def "" : SInst]>; + def _ALL : SInst]>; - def _N32 : SInst]>; - def _N64 : SInst]>; - def _N32_ALL : SInst]>; - def _N64_ALL : SInst]>; + def _N32 : SInst]>; + def _N64 : SInst]>; + def _N32_ALL : SInst]>; + def _N64_ALL : SInst]>; } defm SVQDECB_S : SInst_SAT1<"svqdecb", "aarch64_sve_sqdecb", SignedByte>; @@ -1405,32 +1206,32 @@ defm SVQINCD_S : SInst_SAT2<"svqincd", "aarch64_sve_sqincd", SignedDoubleWord>; defm SVQINCD_U : SInst_SAT2<"svqincd", "aarch64_sve_uqincd", UnsignedDoubleWord>; -def SVQDECP_S : 
SInst<"svqdecp[_{d}]", "ddP", "sil", MergeNone, "aarch64_sve_sqdecp">; -def SVQDECP_U : SInst<"svqdecp[_{d}]", "ddP", "UsUiUl", MergeNone, "aarch64_sve_uqdecp">; -def SVQINCP_S : SInst<"svqincp[_{d}]", "ddP", "sil", MergeNone, "aarch64_sve_sqincp">; -def SVQINCP_U : SInst<"svqincp[_{d}]", "ddP", "UsUiUl", MergeNone, "aarch64_sve_uqincp">; +def SVQDECP_S : SInst<"svqdecp[_{d}]", "ddP", "sil", MergeNone, "aarch64_sve_sqdecp", [IsStreamingCompatible]>; +def SVQDECP_U : SInst<"svqdecp[_{d}]", "ddP", "UsUiUl", MergeNone, "aarch64_sve_uqdecp", [IsStreamingCompatible]>; +def SVQINCP_S : SInst<"svqincp[_{d}]", "ddP", "sil", MergeNone, "aarch64_sve_sqincp", [IsStreamingCompatible]>; +def SVQINCP_U : SInst<"svqincp[_{d}]", "ddP", "UsUiUl", MergeNone, "aarch64_sve_uqincp", [IsStreamingCompatible]>; -def SVQDECP_N_S32 : SInst<"svqdecp[_n_s32]_{d}", "kkP", "PcPsPiPl", MergeNone, "aarch64_sve_sqdecp_n32">; -def SVQDECP_N_S64 : SInst<"svqdecp[_n_s64]_{d}", "llP", "PcPsPiPl", MergeNone, "aarch64_sve_sqdecp_n64">; -def SVQDECP_N_U32 : SInst<"svqdecp[_n_u32]_{d}", "mmP", "PcPsPiPl", MergeNone, "aarch64_sve_uqdecp_n32">; -def SVQDECP_N_U64 : SInst<"svqdecp[_n_u64]_{d}", "nnP", "PcPsPiPl", MergeNone, "aarch64_sve_uqdecp_n64">; -def SVQINCP_N_S32 : SInst<"svqincp[_n_s32]_{d}", "kkP", "PcPsPiPl", MergeNone, "aarch64_sve_sqincp_n32">; -def SVQINCP_N_S64 : SInst<"svqincp[_n_s64]_{d}", "llP", "PcPsPiPl", MergeNone, "aarch64_sve_sqincp_n64">; -def SVQINCP_N_U32 : SInst<"svqincp[_n_u32]_{d}", "mmP", "PcPsPiPl", MergeNone, "aarch64_sve_uqincp_n32">; -def SVQINCP_N_U64 : SInst<"svqincp[_n_u64]_{d}", "nnP", "PcPsPiPl", MergeNone, "aarch64_sve_uqincp_n64">; +def SVQDECP_N_S32 : SInst<"svqdecp[_n_s32]_{d}", "kkP", "PcPsPiPl", MergeNone, "aarch64_sve_sqdecp_n32", [IsStreamingCompatible]>; +def SVQDECP_N_S64 : SInst<"svqdecp[_n_s64]_{d}", "llP", "PcPsPiPl", MergeNone, "aarch64_sve_sqdecp_n64", [IsStreamingCompatible]>; +def SVQDECP_N_U32 : SInst<"svqdecp[_n_u32]_{d}", "mmP", "PcPsPiPl", MergeNone, "aarch64_sve_uqdecp_n32", [IsStreamingCompatible]>; +def SVQDECP_N_U64 : SInst<"svqdecp[_n_u64]_{d}", "nnP", "PcPsPiPl", MergeNone, "aarch64_sve_uqdecp_n64", [IsStreamingCompatible]>; +def SVQINCP_N_S32 : SInst<"svqincp[_n_s32]_{d}", "kkP", "PcPsPiPl", MergeNone, "aarch64_sve_sqincp_n32", [IsStreamingCompatible]>; +def SVQINCP_N_S64 : SInst<"svqincp[_n_s64]_{d}", "llP", "PcPsPiPl", MergeNone, "aarch64_sve_sqincp_n64", [IsStreamingCompatible]>; +def SVQINCP_N_U32 : SInst<"svqincp[_n_u32]_{d}", "mmP", "PcPsPiPl", MergeNone, "aarch64_sve_uqincp_n32", [IsStreamingCompatible]>; +def SVQINCP_N_U64 : SInst<"svqincp[_n_u64]_{d}", "nnP", "PcPsPiPl", MergeNone, "aarch64_sve_uqincp_n64", [IsStreamingCompatible]>; let TargetGuard = "sve,i8mm" in { def SVMLLA_S32 : SInst<"svmmla[_s32]", "ddqq","i", MergeNone, "aarch64_sve_smmla">; def SVMLLA_U32 : SInst<"svmmla[_u32]", "ddqq","Ui", MergeNone, "aarch64_sve_ummla">; def SVUSMLLA_S32 : SInst<"svusmmla[_s32]", "ddbq","i", MergeNone, "aarch64_sve_usmmla">; -def SVUSDOT_S : SInst<"svusdot[_s32]", "ddbq", "i", MergeNone, "aarch64_sve_usdot">; -def SVUSDOT_N_S : SInst<"svusdot[_n_s32]", "ddbr", "i", MergeNone, "aarch64_sve_usdot">; -def SVSUDOT_S : SInst<"svsudot[_s32]", "ddqb", "i", MergeNone, "aarch64_sve_usdot", [ReverseUSDOT]>; -def SVSUDOT_N_S : SInst<"svsudot[_n_s32]", "ddq@", "i", MergeNone, "aarch64_sve_usdot", [ReverseUSDOT]>; +def SVUSDOT_S : SInst<"svusdot[_s32]", "ddbq", "i", MergeNone, "aarch64_sve_usdot", [IsStreamingCompatible]>; +def SVUSDOT_N_S : SInst<"svusdot[_n_s32]", "ddbr", "i", 
MergeNone, "aarch64_sve_usdot", [IsStreamingCompatible]>; +def SVSUDOT_S : SInst<"svsudot[_s32]", "ddqb", "i", MergeNone, "aarch64_sve_usdot", [ReverseUSDOT, IsStreamingCompatible]>; +def SVSUDOT_N_S : SInst<"svsudot[_n_s32]", "ddq@", "i", MergeNone, "aarch64_sve_usdot", [ReverseUSDOT, IsStreamingCompatible]>; -def SVUSDOT_LANE_S : SInst<"svusdot_lane[_s32]", "ddbqi", "i", MergeNone, "aarch64_sve_usdot_lane", [], [ImmCheck<3, ImmCheckLaneIndexDot, 2>]>; -def SVSUDOT_LANE_S : SInst<"svsudot_lane[_s32]", "ddqbi", "i", MergeNone, "aarch64_sve_sudot_lane", [], [ImmCheck<3, ImmCheckLaneIndexDot, 2>]>; +def SVUSDOT_LANE_S : SInst<"svusdot_lane[_s32]", "ddbqi", "i", MergeNone, "aarch64_sve_usdot_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndexDot, 2>]>; +def SVSUDOT_LANE_S : SInst<"svsudot_lane[_s32]", "ddqbi", "i", MergeNone, "aarch64_sve_sudot_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndexDot, 2>]>; } let TargetGuard = "sve,f32mm" in { @@ -1441,74 +1242,87 @@ def SVMLLA_F64 : SInst<"svmmla[_f64]", "dddd","d", MergeNone, "aarch64_sve_fmmla">; def SVTRN1Q : SInst<"svtrn1q[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_trn1q">; def SVTRN2Q : SInst<"svtrn2q[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_trn2q">; -def SVUZP1Q : SInst<"svuzp1q[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_uzp1q">; -def SVUZP2Q : SInst<"svuzp2q[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_uzp2q">; -def SVZIP1Q : SInst<"svzip1q[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_zip1q">; -def SVZIP2Q : SInst<"svzip2q[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_zip2q">; +def SVUZP1Q : SInst<"svuzp1q[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_uzp1q", [IsStreamingCompatible]>; +def SVUZP2Q : SInst<"svuzp2q[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_uzp2q", [IsStreamingCompatible]>; +def SVZIP1Q : SInst<"svzip1q[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_zip1q", [IsStreamingCompatible]>; +def SVZIP2Q : SInst<"svzip2q[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_zip2q", [IsStreamingCompatible]>; } let TargetGuard = "sve,bf16,f64mm" in { def SVTRN1Q_BF16 : SInst<"svtrn1q[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_trn1q">; def SVTRN2Q_BF16 : SInst<"svtrn2q[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_trn2q">; -def SVUZP1Q_BF16 : SInst<"svuzp1q[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_uzp1q">; -def SVUZP2Q_BF16 : SInst<"svuzp2q[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_uzp2q">; -def SVZIP1Q_BF16 : SInst<"svzip1q[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_zip1q">; -def SVZIP2Q_BF16 : SInst<"svzip2q[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_zip2q">; +def SVUZP1Q_BF16 : SInst<"svuzp1q[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_uzp1q", [IsStreamingCompatible]>; +def SVUZP2Q_BF16 : SInst<"svuzp2q[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_uzp2q", [IsStreamingCompatible]>; +def SVZIP1Q_BF16 : SInst<"svzip1q[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_zip1q", [IsStreamingCompatible]>; +def SVZIP2Q_BF16 : SInst<"svzip2q[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_zip2q", [IsStreamingCompatible]>; } //////////////////////////////////////////////////////////////////////////////// // Vector creation -def SVUNDEF_1 : SInst<"svundef_{d}", "dv", "csilUcUsUiUlhfd", MergeNone, "", [IsUndef]>; -def SVUNDEF_2 : SInst<"svundef2_{d}", "2v", "csilUcUsUiUlhfd", MergeNone, "", [IsUndef]>; -def SVUNDEF_3 : SInst<"svundef3_{d}", "3v", "csilUcUsUiUlhfd", MergeNone, "", [IsUndef]>; -def 
SVUNDEF_4 : SInst<"svundef4_{d}", "4v", "csilUcUsUiUlhfd", MergeNone, "", [IsUndef]>; +def SVUNDEF_1 : SInst<"svundef_{d}", "dv", "csilUcUsUiUlhfd", MergeNone, "", [IsUndef, IsStreamingCompatible]>; +def SVUNDEF_2 : SInst<"svundef2_{d}", "2v", "csilUcUsUiUlhfd", MergeNone, "", [IsUndef, IsStreamingCompatible]>; +def SVUNDEF_3 : SInst<"svundef3_{d}", "3v", "csilUcUsUiUlhfd", MergeNone, "", [IsUndef, IsStreamingCompatible]>; +def SVUNDEF_4 : SInst<"svundef4_{d}", "4v", "csilUcUsUiUlhfd", MergeNone, "", [IsUndef, IsStreamingCompatible]>; -def SVCREATE_2 : SInst<"svcreate2[_{d}]", "2dd", "csilUcUsUiUlhfd", MergeNone, "", [IsTupleCreate]>; -def SVCREATE_3 : SInst<"svcreate3[_{d}]", "3ddd", "csilUcUsUiUlhfd", MergeNone, "", [IsTupleCreate]>; -def SVCREATE_4 : SInst<"svcreate4[_{d}]", "4dddd", "csilUcUsUiUlhfd", MergeNone, "", [IsTupleCreate]>; +def SVCREATE_2 : SInst<"svcreate2[_{d}]", "2dd", "csilUcUsUiUlhfd", MergeNone, "", [IsTupleCreate, IsStreamingCompatible]>; +def SVCREATE_3 : SInst<"svcreate3[_{d}]", "3ddd", "csilUcUsUiUlhfd", MergeNone, "", [IsTupleCreate, IsStreamingCompatible]>; +def SVCREATE_4 : SInst<"svcreate4[_{d}]", "4dddd", "csilUcUsUiUlhfd", MergeNone, "", [IsTupleCreate, IsStreamingCompatible]>; let TargetGuard = "sve,bf16" in { -def SVUNDEF_1_BF16 : SInst<"svundef_{d}", "dv", "b", MergeNone, "", [IsUndef]>; -def SVUNDEF_2_BF16 : SInst<"svundef2_{d}", "2v", "b", MergeNone, "", [IsUndef]>; -def SVUNDEF_3_BF16 : SInst<"svundef3_{d}", "3v", "b", MergeNone, "", [IsUndef]>; -def SVUNDEF_4_BF16 : SInst<"svundef4_{d}", "4v", "b", MergeNone, "", [IsUndef]>; +def SVUNDEF_1_BF16 : SInst<"svundef_{d}", "dv", "b", MergeNone, "", [IsUndef, IsStreamingCompatible]>; +def SVUNDEF_2_BF16 : SInst<"svundef2_{d}", "2v", "b", MergeNone, "", [IsUndef, IsStreamingCompatible]>; +def SVUNDEF_3_BF16 : SInst<"svundef3_{d}", "3v", "b", MergeNone, "", [IsUndef, IsStreamingCompatible]>; +def SVUNDEF_4_BF16 : SInst<"svundef4_{d}", "4v", "b", MergeNone, "", [IsUndef, IsStreamingCompatible]>; -def SVCREATE_2_BF16 : SInst<"svcreate2[_{d}]", "2dd", "b", MergeNone, "", [IsTupleCreate]>; -def SVCREATE_3_BF16 : SInst<"svcreate3[_{d}]", "3ddd", "b", MergeNone, "", [IsTupleCreate]>; -def SVCREATE_4_BF16 : SInst<"svcreate4[_{d}]", "4dddd", "b", MergeNone, "", [IsTupleCreate]>; +def SVCREATE_2_BF16 : SInst<"svcreate2[_{d}]", "2dd", "b", MergeNone, "", [IsTupleCreate, IsStreamingCompatible]>; +def SVCREATE_3_BF16 : SInst<"svcreate3[_{d}]", "3ddd", "b", MergeNone, "", [IsTupleCreate, IsStreamingCompatible]>; +def SVCREATE_4_BF16 : SInst<"svcreate4[_{d}]", "4dddd", "b", MergeNone, "", [IsTupleCreate, IsStreamingCompatible]>; +} + +let TargetGuard = "sme2" in { + def SVCREATE_2_B : SInst<"svcreate2[_{d}]", "2dd", "Pc", MergeNone, "", [IsTupleCreate, IsStreamingCompatible]>; + def SVCREATE_4_B : SInst<"svcreate4[_{d}]", "4dddd", "Pc", MergeNone, "", [IsTupleCreate, IsStreamingCompatible]>; } //////////////////////////////////////////////////////////////////////////////// // Vector insertion and extraction -def SVGET_2 : SInst<"svget2[_{d}]", "d2i", "csilUcUsUiUlhfd", MergeNone, "", [IsTupleGet], [ImmCheck<1, ImmCheck0_1>]>; -def SVGET_3 : SInst<"svget3[_{d}]", "d3i", "csilUcUsUiUlhfd", MergeNone, "", [IsTupleGet], [ImmCheck<1, ImmCheck0_2>]>; -def SVGET_4 : SInst<"svget4[_{d}]", "d4i", "csilUcUsUiUlhfd", MergeNone, "", [IsTupleGet], [ImmCheck<1, ImmCheck0_3>]>; +def SVGET_2 : SInst<"svget2[_{d}]", "d2i", "csilUcUsUiUlhfd", MergeNone, "", [IsTupleGet, IsStreamingCompatible], [ImmCheck<1, ImmCheck0_1>]>; +def SVGET_3 : 
SInst<"svget3[_{d}]", "d3i", "csilUcUsUiUlhfd", MergeNone, "", [IsTupleGet, IsStreamingCompatible], [ImmCheck<1, ImmCheck0_2>]>; +def SVGET_4 : SInst<"svget4[_{d}]", "d4i", "csilUcUsUiUlhfd", MergeNone, "", [IsTupleGet, IsStreamingCompatible], [ImmCheck<1, ImmCheck0_3>]>; -def SVSET_2 : SInst<"svset2[_{d}]", "22id", "csilUcUsUiUlhfd", MergeNone, "", [IsTupleSet], [ImmCheck<1, ImmCheck0_1>]>; -def SVSET_3 : SInst<"svset3[_{d}]", "33id", "csilUcUsUiUlhfd", MergeNone, "", [IsTupleSet], [ImmCheck<1, ImmCheck0_2>]>; -def SVSET_4 : SInst<"svset4[_{d}]", "44id", "csilUcUsUiUlhfd", MergeNone, "", [IsTupleSet], [ImmCheck<1, ImmCheck0_3>]>; +def SVSET_2 : SInst<"svset2[_{d}]", "22id", "csilUcUsUiUlhfd", MergeNone, "", [IsTupleSet, IsStreamingCompatible], [ImmCheck<1, ImmCheck0_1>]>; +def SVSET_3 : SInst<"svset3[_{d}]", "33id", "csilUcUsUiUlhfd", MergeNone, "", [IsTupleSet, IsStreamingCompatible], [ImmCheck<1, ImmCheck0_2>]>; +def SVSET_4 : SInst<"svset4[_{d}]", "44id", "csilUcUsUiUlhfd", MergeNone, "", [IsTupleSet, IsStreamingCompatible], [ImmCheck<1, ImmCheck0_3>]>; let TargetGuard = "sve,bf16" in { -def SVGET_2_BF16 : SInst<"svget2[_{d}]", "d2i", "b", MergeNone, "", [IsTupleGet], [ImmCheck<1, ImmCheck0_1>]>; -def SVGET_3_BF16 : SInst<"svget3[_{d}]", "d3i", "b", MergeNone, "", [IsTupleGet], [ImmCheck<1, ImmCheck0_2>]>; -def SVGET_4_BF16 : SInst<"svget4[_{d}]", "d4i", "b", MergeNone, "", [IsTupleGet], [ImmCheck<1, ImmCheck0_3>]>; +def SVGET_2_BF16 : SInst<"svget2[_{d}]", "d2i", "b", MergeNone, "", [IsTupleGet, IsStreamingCompatible], [ImmCheck<1, ImmCheck0_1>]>; +def SVGET_3_BF16 : SInst<"svget3[_{d}]", "d3i", "b", MergeNone, "", [IsTupleGet, IsStreamingCompatible], [ImmCheck<1, ImmCheck0_2>]>; +def SVGET_4_BF16 : SInst<"svget4[_{d}]", "d4i", "b", MergeNone, "", [IsTupleGet, IsStreamingCompatible], [ImmCheck<1, ImmCheck0_3>]>; + +def SVSET_2_BF16 : SInst<"svset2[_{d}]", "22id", "b", MergeNone, "", [IsTupleSet, IsStreamingCompatible], [ImmCheck<1, ImmCheck0_1>]>; +def SVSET_3_BF16 : SInst<"svset3[_{d}]", "33id", "b", MergeNone, "", [IsTupleSet, IsStreamingCompatible], [ImmCheck<1, ImmCheck0_2>]>; +def SVSET_4_BF16 : SInst<"svset4[_{d}]", "44id", "b", MergeNone, "", [IsTupleSet, IsStreamingCompatible], [ImmCheck<1, ImmCheck0_3>]>; +} + +let TargetGuard = "sme2" in { + def SVGET_2_B : SInst<"svget2[_{d}]", "d2i", "Pc", MergeNone, "", [IsTupleGet, IsStreamingCompatible], [ImmCheck<1, ImmCheck0_1>]>; + def SVGET_4_B : SInst<"svget4[_{d}]", "d4i", "Pc", MergeNone, "", [IsTupleGet, IsStreamingCompatible], [ImmCheck<1, ImmCheck0_3>]>; -def SVSET_2_BF16 : SInst<"svset2[_{d}]", "22id", "b", MergeNone, "", [IsTupleSet], [ImmCheck<1, ImmCheck0_1>]>; -def SVSET_3_BF16 : SInst<"svset3[_{d}]", "33id", "b", MergeNone, "", [IsTupleSet], [ImmCheck<1, ImmCheck0_2>]>; -def SVSET_4_BF16 : SInst<"svset4[_{d}]", "44id", "b", MergeNone, "", [IsTupleSet], [ImmCheck<1, ImmCheck0_3>]>; + def SVSET_2_B : SInst<"svset2[_{d}]", "22id", "Pc", MergeNone, "", [IsTupleSet, IsStreamingCompatible], [ImmCheck<1, ImmCheck0_1>]>; + def SVSET_4_B : SInst<"svset4[_{d}]", "44id", "Pc", MergeNone, "", [IsTupleSet, IsStreamingCompatible], [ImmCheck<1, ImmCheck0_3>]>; } //////////////////////////////////////////////////////////////////////////////// // SVE2 WhileGE/GT let TargetGuard = "sve2" in { -def SVWHILEGE_S32 : SInst<"svwhilege_{d}[_{1}]", "Pkk", "PcPsPiPl", MergeNone, "aarch64_sve_whilege", [IsOverloadWhile]>; -def SVWHILEGE_S64 : SInst<"svwhilege_{d}[_{1}]", "Pll", "PcPsPiPl", MergeNone, "aarch64_sve_whilege", [IsOverloadWhile]>; -def 
SVWHILEGT_S32 : SInst<"svwhilegt_{d}[_{1}]", "Pkk", "PcPsPiPl", MergeNone, "aarch64_sve_whilegt", [IsOverloadWhile]>; -def SVWHILEGT_S64 : SInst<"svwhilegt_{d}[_{1}]", "Pll", "PcPsPiPl", MergeNone, "aarch64_sve_whilegt", [IsOverloadWhile]>; -def SVWHILEHI_U32 : SInst<"svwhilegt_{d}[_{1}]", "Pmm", "PUcPUsPUiPUl", MergeNone, "aarch64_sve_whilehi", [IsOverloadWhile]>; -def SVWHILEHI_U64 : SInst<"svwhilegt_{d}[_{1}]", "Pnn", "PUcPUsPUiPUl", MergeNone, "aarch64_sve_whilehi", [IsOverloadWhile]>; -def SVWHILEHS_U32 : SInst<"svwhilege_{d}[_{1}]", "Pmm", "PUcPUsPUiPUl", MergeNone, "aarch64_sve_whilehs", [IsOverloadWhile]>; -def SVWHILEHS_U64 : SInst<"svwhilege_{d}[_{1}]", "Pnn", "PUcPUsPUiPUl", MergeNone, "aarch64_sve_whilehs", [IsOverloadWhile]>; +def SVWHILEGE_S32 : SInst<"svwhilege_{d}[_{1}]", "Pkk", "PcPsPiPl", MergeNone, "aarch64_sve_whilege", [IsOverloadWhile, IsStreamingCompatible]>; +def SVWHILEGE_S64 : SInst<"svwhilege_{d}[_{1}]", "Pll", "PcPsPiPl", MergeNone, "aarch64_sve_whilege", [IsOverloadWhile, IsStreamingCompatible]>; +def SVWHILEGT_S32 : SInst<"svwhilegt_{d}[_{1}]", "Pkk", "PcPsPiPl", MergeNone, "aarch64_sve_whilegt", [IsOverloadWhile, IsStreamingCompatible]>; +def SVWHILEGT_S64 : SInst<"svwhilegt_{d}[_{1}]", "Pll", "PcPsPiPl", MergeNone, "aarch64_sve_whilegt", [IsOverloadWhile, IsStreamingCompatible]>; +def SVWHILEHI_U32 : SInst<"svwhilegt_{d}[_{1}]", "Pmm", "PUcPUsPUiPUl", MergeNone, "aarch64_sve_whilehi", [IsOverloadWhile, IsStreamingCompatible]>; +def SVWHILEHI_U64 : SInst<"svwhilegt_{d}[_{1}]", "Pnn", "PUcPUsPUiPUl", MergeNone, "aarch64_sve_whilehi", [IsOverloadWhile, IsStreamingCompatible]>; +def SVWHILEHS_U32 : SInst<"svwhilege_{d}[_{1}]", "Pmm", "PUcPUsPUiPUl", MergeNone, "aarch64_sve_whilehs", [IsOverloadWhile, IsStreamingCompatible]>; +def SVWHILEHS_U64 : SInst<"svwhilege_{d}[_{1}]", "Pnn", "PUcPUsPUiPUl", MergeNone, "aarch64_sve_whilehs", [IsOverloadWhile, IsStreamingCompatible]>; } //////////////////////////////////////////////////////////////////////////////// @@ -1550,49 +1364,49 @@ } let TargetGuard = "sve2" in { -defm SVQRSHL_S : SInstZPZxZ<"svqrshl", "csil", "dPdx", "dPdK", "aarch64_sve_sqrshl">; -defm SVQRSHL_U : SInstZPZxZ<"svqrshl", "UcUsUiUl", "dPdx", "dPdK", "aarch64_sve_uqrshl">; -defm SVQSHL_S : SInstZPZxZ<"svqshl", "csil", "dPdx", "dPdK", "aarch64_sve_sqshl">; -defm SVQSHL_U : SInstZPZxZ<"svqshl", "UcUsUiUl", "dPdx", "dPdK", "aarch64_sve_uqshl">; -defm SVRSHL_S : SInstZPZxZ<"svrshl", "csil", "dPdx", "dPdK", "aarch64_sve_srshl">; -defm SVRSHL_U : SInstZPZxZ<"svrshl", "UcUsUiUl", "dPdx", "dPdK", "aarch64_sve_urshl">; -defm SVSQADD : SInstZPZxZ<"svsqadd", "UcUsUiUl", "dPdx", "dPdK", "aarch64_sve_usqadd">; -defm SVUQADD : SInstZPZxZ<"svuqadd", "csil", "dPdu", "dPdL", "aarch64_sve_suqadd">; - -def SVABA_S : SInst<"svaba[_{d}]", "dddd", "csil" , MergeNone, "aarch64_sve_saba">; -def SVABA_U : SInst<"svaba[_{d}]", "dddd", "UcUsUiUl", MergeNone, "aarch64_sve_uaba">; -def SVQDMULH : SInst<"svqdmulh[_{d}]", "ddd", "csil", MergeNone, "aarch64_sve_sqdmulh">; -def SVQRDMULH : SInst<"svqrdmulh[_{d}]", "ddd", "csil", MergeNone, "aarch64_sve_sqrdmulh">; -def SVQRDMLAH : SInst<"svqrdmlah[_{d}]", "dddd", "csil", MergeNone, "aarch64_sve_sqrdmlah">; -def SVQRDMLSH : SInst<"svqrdmlsh[_{d}]", "dddd", "csil", MergeNone, "aarch64_sve_sqrdmlsh">; - -def SVABA_S_N : SInst<"svaba[_n_{d}]", "ddda", "csil", MergeNone, "aarch64_sve_saba">; -def SVABA_U_N : SInst<"svaba[_n_{d}]", "ddda", "UcUsUiUl", MergeNone, "aarch64_sve_uaba">; -def SVQDMULH_N : SInst<"svqdmulh[_n_{d}]", "dda", "csil", 
MergeNone, "aarch64_sve_sqdmulh">; -def SVQRDMULH_N : SInst<"svqrdmulh[_n_{d}]", "dda", "csil", MergeNone, "aarch64_sve_sqrdmulh">; -def SVQRDMLAH_N : SInst<"svqrdmlah[_n_{d}]", "ddda", "csil", MergeNone, "aarch64_sve_sqrdmlah">; -def SVQRDMLSH_N : SInst<"svqrdmlsh[_n_{d}]", "ddda", "csil", MergeNone, "aarch64_sve_sqrdmlsh">; - -def SVQDMULH_LANE : SInst<"svqdmulh_lane[_{d}]", "dddi", "sil", MergeNone, "aarch64_sve_sqdmulh_lane", [], [ImmCheck<2, ImmCheckLaneIndex, 1>]>; -def SVQRDMULH_LANE : SInst<"svqrdmulh_lane[_{d}]", "dddi", "sil", MergeNone, "aarch64_sve_sqrdmulh_lane", [], [ImmCheck<2, ImmCheckLaneIndex, 1>]>; -def SVQRDMLAH_LANE : SInst<"svqrdmlah_lane[_{d}]", "ddddi", "sil", MergeNone, "aarch64_sve_sqrdmlah_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; -def SVQRDMLSH_LANE : SInst<"svqrdmlsh_lane[_{d}]", "ddddi", "sil", MergeNone, "aarch64_sve_sqrdmlsh_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; - -def SVQSHLU_M : SInst<"svqshlu[_n_{d}]", "uPdi", "csil", MergeOp1, "aarch64_sve_sqshlu", [], [ImmCheck<2, ImmCheckShiftLeft, 1>]>; -def SVQSHLU_X : SInst<"svqshlu[_n_{d}]", "uPdi", "csil", MergeAny, "aarch64_sve_sqshlu", [], [ImmCheck<2, ImmCheckShiftLeft, 1>]>; -def SVQSHLU_Z : SInst<"svqshlu[_n_{d}]", "uPdi", "csil", MergeZero, "aarch64_sve_sqshlu", [], [ImmCheck<2, ImmCheckShiftLeft, 1>]>; -def SVRSHR_M_S : SInst<"svrshr[_n_{d}]", "dPdi", "csil", MergeOp1, "aarch64_sve_srshr", [], [ImmCheck<2, ImmCheckShiftRight, 1>]>; -def SVRSHR_M_U : SInst<"svrshr[_n_{d}]", "dPdi", "UcUsUiUl", MergeOp1, "aarch64_sve_urshr", [], [ImmCheck<2, ImmCheckShiftRight, 1>]>; -def SVRSHR_X_S : SInst<"svrshr[_n_{d}]", "dPdi", "csil", MergeAny, "aarch64_sve_srshr", [], [ImmCheck<2, ImmCheckShiftRight, 1>]>; -def SVRSHR_X_U : SInst<"svrshr[_n_{d}]", "dPdi", "UcUsUiUl", MergeAny, "aarch64_sve_urshr", [], [ImmCheck<2, ImmCheckShiftRight, 1>]>; -def SVRSHR_Z_S : SInst<"svrshr[_n_{d}]", "dPdi", "csil", MergeZero, "aarch64_sve_srshr", [], [ImmCheck<2, ImmCheckShiftRight, 1>]>; -def SVRSHR_Z_U : SInst<"svrshr[_n_{d}]", "dPdi", "UcUsUiUl", MergeZero, "aarch64_sve_urshr", [], [ImmCheck<2, ImmCheckShiftRight, 1>]>; -def SVRSRA_S : SInst<"svrsra[_n_{d}]", "dddi", "csil", MergeNone, "aarch64_sve_srsra", [], [ImmCheck<2, ImmCheckShiftRight, 1>]>; -def SVRSRA_U : SInst<"svrsra[_n_{d}]", "dddi", "UcUsUiUl", MergeNone, "aarch64_sve_ursra", [], [ImmCheck<2, ImmCheckShiftRight, 1>]>; -def SVSLI : SInst<"svsli[_n_{d}]", "dddi", "csilUcUsUiUl", MergeNone, "aarch64_sve_sli", [], [ImmCheck<2, ImmCheckShiftLeft, 1>]>; -def SVSRA_S : SInst<"svsra[_n_{d}]", "dddi", "csil", MergeNone, "aarch64_sve_ssra", [], [ImmCheck<2, ImmCheckShiftRight, 1>]>; -def SVSRA_U : SInst<"svsra[_n_{d}]", "dddi", "UcUsUiUl", MergeNone, "aarch64_sve_usra", [], [ImmCheck<2, ImmCheckShiftRight, 1>]>; -def SVSRI : SInst<"svsri[_n_{d}]", "dddi", "csilUcUsUiUl", MergeNone, "aarch64_sve_sri", [], [ImmCheck<2, ImmCheckShiftRight, 1>]>; +defm SVQRSHL_S : SInstZPZxZ<"svqrshl", "csil", "dPdx", "dPdK", "aarch64_sve_sqrshl", [IsStreamingCompatible]>; +defm SVQRSHL_U : SInstZPZxZ<"svqrshl", "UcUsUiUl", "dPdx", "dPdK", "aarch64_sve_uqrshl", [IsStreamingCompatible]>; +defm SVQSHL_S : SInstZPZxZ<"svqshl", "csil", "dPdx", "dPdK", "aarch64_sve_sqshl", [IsStreamingCompatible]>; +defm SVQSHL_U : SInstZPZxZ<"svqshl", "UcUsUiUl", "dPdx", "dPdK", "aarch64_sve_uqshl", [IsStreamingCompatible]>; +defm SVRSHL_S : SInstZPZxZ<"svrshl", "csil", "dPdx", "dPdK", "aarch64_sve_srshl", [IsStreamingCompatible]>; +defm SVRSHL_U : SInstZPZxZ<"svrshl", "UcUsUiUl", "dPdx", "dPdK", 
"aarch64_sve_urshl", [IsStreamingCompatible]>; +defm SVSQADD : SInstZPZxZ<"svsqadd", "UcUsUiUl", "dPdx", "dPdK", "aarch64_sve_usqadd", [IsStreamingCompatible]>; +defm SVUQADD : SInstZPZxZ<"svuqadd", "csil", "dPdu", "dPdL", "aarch64_sve_suqadd", [IsStreamingCompatible]>; + +def SVABA_S : SInst<"svaba[_{d}]", "dddd", "csil" , MergeNone, "aarch64_sve_saba", [IsStreamingCompatible]>; +def SVABA_U : SInst<"svaba[_{d}]", "dddd", "UcUsUiUl", MergeNone, "aarch64_sve_uaba", [IsStreamingCompatible]>; +def SVQDMULH : SInst<"svqdmulh[_{d}]", "ddd", "csil", MergeNone, "aarch64_sve_sqdmulh", [IsStreamingCompatible]>; +def SVQRDMULH : SInst<"svqrdmulh[_{d}]", "ddd", "csil", MergeNone, "aarch64_sve_sqrdmulh", [IsStreamingCompatible]>; +def SVQRDMLAH : SInst<"svqrdmlah[_{d}]", "dddd", "csil", MergeNone, "aarch64_sve_sqrdmlah", [IsStreamingCompatible]>; +def SVQRDMLSH : SInst<"svqrdmlsh[_{d}]", "dddd", "csil", MergeNone, "aarch64_sve_sqrdmlsh", [IsStreamingCompatible]>; + +def SVABA_S_N : SInst<"svaba[_n_{d}]", "ddda", "csil", MergeNone, "aarch64_sve_saba", [IsStreamingCompatible]>; +def SVABA_U_N : SInst<"svaba[_n_{d}]", "ddda", "UcUsUiUl", MergeNone, "aarch64_sve_uaba", [IsStreamingCompatible]>; +def SVQDMULH_N : SInst<"svqdmulh[_n_{d}]", "dda", "csil", MergeNone, "aarch64_sve_sqdmulh", [IsStreamingCompatible]>; +def SVQRDMULH_N : SInst<"svqrdmulh[_n_{d}]", "dda", "csil", MergeNone, "aarch64_sve_sqrdmulh", [IsStreamingCompatible]>; +def SVQRDMLAH_N : SInst<"svqrdmlah[_n_{d}]", "ddda", "csil", MergeNone, "aarch64_sve_sqrdmlah", [IsStreamingCompatible]>; +def SVQRDMLSH_N : SInst<"svqrdmlsh[_n_{d}]", "ddda", "csil", MergeNone, "aarch64_sve_sqrdmlsh", [IsStreamingCompatible]>; + +def SVQDMULH_LANE : SInst<"svqdmulh_lane[_{d}]", "dddi", "sil", MergeNone, "aarch64_sve_sqdmulh_lane", [IsStreamingCompatible], [ImmCheck<2, ImmCheckLaneIndex, 1>]>; +def SVQRDMULH_LANE : SInst<"svqrdmulh_lane[_{d}]", "dddi", "sil", MergeNone, "aarch64_sve_sqrdmulh_lane", [IsStreamingCompatible], [ImmCheck<2, ImmCheckLaneIndex, 1>]>; +def SVQRDMLAH_LANE : SInst<"svqrdmlah_lane[_{d}]", "ddddi", "sil", MergeNone, "aarch64_sve_sqrdmlah_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; +def SVQRDMLSH_LANE : SInst<"svqrdmlsh_lane[_{d}]", "ddddi", "sil", MergeNone, "aarch64_sve_sqrdmlsh_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; + +def SVQSHLU_M : SInst<"svqshlu[_n_{d}]", "uPdi", "csil", MergeOp1, "aarch64_sve_sqshlu", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftLeft, 1>]>; +def SVQSHLU_X : SInst<"svqshlu[_n_{d}]", "uPdi", "csil", MergeAny, "aarch64_sve_sqshlu", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftLeft, 1>]>; +def SVQSHLU_Z : SInst<"svqshlu[_n_{d}]", "uPdi", "csil", MergeZero, "aarch64_sve_sqshlu", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftLeft, 1>]>; +def SVRSHR_M_S : SInst<"svrshr[_n_{d}]", "dPdi", "csil", MergeOp1, "aarch64_sve_srshr", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRight, 1>]>; +def SVRSHR_M_U : SInst<"svrshr[_n_{d}]", "dPdi", "UcUsUiUl", MergeOp1, "aarch64_sve_urshr", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRight, 1>]>; +def SVRSHR_X_S : SInst<"svrshr[_n_{d}]", "dPdi", "csil", MergeAny, "aarch64_sve_srshr", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRight, 1>]>; +def SVRSHR_X_U : SInst<"svrshr[_n_{d}]", "dPdi", "UcUsUiUl", MergeAny, "aarch64_sve_urshr", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRight, 1>]>; +def SVRSHR_Z_S : SInst<"svrshr[_n_{d}]", "dPdi", "csil", MergeZero, "aarch64_sve_srshr", 
[IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRight, 1>]>; +def SVRSHR_Z_U : SInst<"svrshr[_n_{d}]", "dPdi", "UcUsUiUl", MergeZero, "aarch64_sve_urshr", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRight, 1>]>; +def SVRSRA_S : SInst<"svrsra[_n_{d}]", "dddi", "csil", MergeNone, "aarch64_sve_srsra", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRight, 1>]>; +def SVRSRA_U : SInst<"svrsra[_n_{d}]", "dddi", "UcUsUiUl", MergeNone, "aarch64_sve_ursra", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRight, 1>]>; +def SVSLI : SInst<"svsli[_n_{d}]", "dddi", "csilUcUsUiUl", MergeNone, "aarch64_sve_sli", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftLeft, 1>]>; +def SVSRA_S : SInst<"svsra[_n_{d}]", "dddi", "csil", MergeNone, "aarch64_sve_ssra", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRight, 1>]>; +def SVSRA_U : SInst<"svsra[_n_{d}]", "dddi", "UcUsUiUl", MergeNone, "aarch64_sve_usra", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRight, 1>]>; +def SVSRI : SInst<"svsri[_n_{d}]", "dddi", "csilUcUsUiUl", MergeNone, "aarch64_sve_sri", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRight, 1>]>; } //////////////////////////////////////////////////////////////////////////////// @@ -1604,29 +1418,29 @@ } let TargetGuard = "sve2" in { -defm SVADDP : SInstPairwise<"svaddp", "csliUcUsUiUl", "aarch64_sve_addp">; -defm SVADDP_F : SInstPairwise<"svaddp", "hfd", "aarch64_sve_faddp">; -defm SVMAXNMP : SInstPairwise<"svmaxnmp", "hfd", "aarch64_sve_fmaxnmp">; -defm SVMAXP_F : SInstPairwise<"svmaxp", "hfd", "aarch64_sve_fmaxp">; -defm SVMAXP_S : SInstPairwise<"svmaxp", "csli", "aarch64_sve_smaxp">; -defm SVMAXP_U : SInstPairwise<"svmaxp", "UcUsUiUl", "aarch64_sve_umaxp">; -defm SVMINNMP : SInstPairwise<"svminnmp", "hfd", "aarch64_sve_fminnmp">; -defm SVMINP_F : SInstPairwise<"svminp", "hfd", "aarch64_sve_fminp">; -defm SVMINP_S : SInstPairwise<"svminp", "csli", "aarch64_sve_sminp">; -defm SVMINP_U : SInstPairwise<"svminp", "UcUsUiUl", "aarch64_sve_uminp">; +defm SVADDP : SInstPairwise<"svaddp", "csliUcUsUiUl", "aarch64_sve_addp", [IsStreamingCompatible]>; +defm SVADDP_F : SInstPairwise<"svaddp", "hfd", "aarch64_sve_faddp", [IsStreamingCompatible]>; +defm SVMAXNMP : SInstPairwise<"svmaxnmp", "hfd", "aarch64_sve_fmaxnmp", [IsStreamingCompatible]>; +defm SVMAXP_F : SInstPairwise<"svmaxp", "hfd", "aarch64_sve_fmaxp", [IsStreamingCompatible]>; +defm SVMAXP_S : SInstPairwise<"svmaxp", "csli", "aarch64_sve_smaxp", [IsStreamingCompatible]>; +defm SVMAXP_U : SInstPairwise<"svmaxp", "UcUsUiUl", "aarch64_sve_umaxp", [IsStreamingCompatible]>; +defm SVMINNMP : SInstPairwise<"svminnmp", "hfd", "aarch64_sve_fminnmp", [IsStreamingCompatible]>; +defm SVMINP_F : SInstPairwise<"svminp", "hfd", "aarch64_sve_fminp", [IsStreamingCompatible]>; +defm SVMINP_S : SInstPairwise<"svminp", "csli", "aarch64_sve_sminp", [IsStreamingCompatible]>; +defm SVMINP_U : SInstPairwise<"svminp", "UcUsUiUl", "aarch64_sve_uminp", [IsStreamingCompatible]>; } //////////////////////////////////////////////////////////////////////////////// // SVE2 - Widening pairwise arithmetic let TargetGuard = "sve2" in { -def SVADALP_S_M : SInst<"svadalp[_{d}]", "dPdh", "sil", MergeOp1, "aarch64_sve_sadalp">; -def SVADALP_S_X : SInst<"svadalp[_{d}]", "dPdh", "sil", MergeAny, "aarch64_sve_sadalp">; -def SVADALP_S_Z : SInst<"svadalp[_{d}]", "dPdh", "sil", MergeZero, "aarch64_sve_sadalp">; +def SVADALP_S_M : SInst<"svadalp[_{d}]", "dPdh", "sil", MergeOp1, "aarch64_sve_sadalp", [IsStreamingCompatible]>; +def SVADALP_S_X : 
SInst<"svadalp[_{d}]", "dPdh", "sil", MergeAny, "aarch64_sve_sadalp", [IsStreamingCompatible]>; +def SVADALP_S_Z : SInst<"svadalp[_{d}]", "dPdh", "sil", MergeZero, "aarch64_sve_sadalp", [IsStreamingCompatible]>; -def SVADALP_U_M : SInst<"svadalp[_{d}]", "dPdh", "UsUiUl", MergeOp1, "aarch64_sve_uadalp">; -def SVADALP_U_X : SInst<"svadalp[_{d}]", "dPdh", "UsUiUl", MergeAny, "aarch64_sve_uadalp">; -def SVADALP_U_Z : SInst<"svadalp[_{d}]", "dPdh", "UsUiUl", MergeZero, "aarch64_sve_uadalp">; +def SVADALP_U_M : SInst<"svadalp[_{d}]", "dPdh", "UsUiUl", MergeOp1, "aarch64_sve_uadalp", [IsStreamingCompatible]>; +def SVADALP_U_X : SInst<"svadalp[_{d}]", "dPdh", "UsUiUl", MergeAny, "aarch64_sve_uadalp", [IsStreamingCompatible]>; +def SVADALP_U_Z : SInst<"svadalp[_{d}]", "dPdh", "UsUiUl", MergeZero, "aarch64_sve_uadalp", [IsStreamingCompatible]>; } //////////////////////////////////////////////////////////////////////////////// @@ -1634,56 +1448,56 @@ // let TargetGuard = "sve2" in { -def SVBCAX : SInst<"svbcax[_{d}]", "dddd", "csilUcUsUiUl", MergeNone, "aarch64_sve_bcax">; -def SVBSL : SInst<"svbsl[_{d}]", "dddd", "csilUcUsUiUl", MergeNone, "aarch64_sve_bsl">; -def SVBSL1N : SInst<"svbsl1n[_{d}]", "dddd", "csilUcUsUiUl", MergeNone, "aarch64_sve_bsl1n">; -def SVBSL2N : SInst<"svbsl2n[_{d}]", "dddd", "csilUcUsUiUl", MergeNone, "aarch64_sve_bsl2n">; -def SVEOR3 : SInst<"sveor3[_{d}]", "dddd", "csilUcUsUiUl", MergeNone, "aarch64_sve_eor3">; -def SVNBSL : SInst<"svnbsl[_{d}]", "dddd", "csilUcUsUiUl", MergeNone, "aarch64_sve_nbsl">; - -def SVBCAX_N : SInst<"svbcax[_n_{d}]", "ddda", "csilUcUsUiUl", MergeNone, "aarch64_sve_bcax">; -def SVBSL_N : SInst<"svbsl[_n_{d}]", "ddda", "csilUcUsUiUl", MergeNone, "aarch64_sve_bsl">; -def SVBSL1N_N : SInst<"svbsl1n[_n_{d}]", "ddda", "csilUcUsUiUl", MergeNone, "aarch64_sve_bsl1n">; -def SVBSL2N_N : SInst<"svbsl2n[_n_{d}]", "ddda", "csilUcUsUiUl", MergeNone, "aarch64_sve_bsl2n">; -def SVEOR3_N : SInst<"sveor3[_n_{d}]", "ddda", "csilUcUsUiUl", MergeNone, "aarch64_sve_eor3">; -def SVNBSL_N : SInst<"svnbsl[_n_{d}]", "ddda", "csilUcUsUiUl", MergeNone, "aarch64_sve_nbsl">; -def SVXAR_N : SInst<"svxar[_n_{d}]", "dddi", "csilUcUsUiUl", MergeNone, "aarch64_sve_xar", [], [ImmCheck<2, ImmCheckShiftRight, 1>]>; +def SVBCAX : SInst<"svbcax[_{d}]", "dddd", "csilUcUsUiUl", MergeNone, "aarch64_sve_bcax", [IsStreamingCompatible]>; +def SVBSL : SInst<"svbsl[_{d}]", "dddd", "csilUcUsUiUl", MergeNone, "aarch64_sve_bsl", [IsStreamingCompatible]>; +def SVBSL1N : SInst<"svbsl1n[_{d}]", "dddd", "csilUcUsUiUl", MergeNone, "aarch64_sve_bsl1n", [IsStreamingCompatible]>; +def SVBSL2N : SInst<"svbsl2n[_{d}]", "dddd", "csilUcUsUiUl", MergeNone, "aarch64_sve_bsl2n", [IsStreamingCompatible]>; +def SVEOR3 : SInst<"sveor3[_{d}]", "dddd", "csilUcUsUiUl", MergeNone, "aarch64_sve_eor3", [IsStreamingCompatible]>; +def SVNBSL : SInst<"svnbsl[_{d}]", "dddd", "csilUcUsUiUl", MergeNone, "aarch64_sve_nbsl", [IsStreamingCompatible]>; + +def SVBCAX_N : SInst<"svbcax[_n_{d}]", "ddda", "csilUcUsUiUl", MergeNone, "aarch64_sve_bcax", [IsStreamingCompatible]>; +def SVBSL_N : SInst<"svbsl[_n_{d}]", "ddda", "csilUcUsUiUl", MergeNone, "aarch64_sve_bsl", [IsStreamingCompatible]>; +def SVBSL1N_N : SInst<"svbsl1n[_n_{d}]", "ddda", "csilUcUsUiUl", MergeNone, "aarch64_sve_bsl1n", [IsStreamingCompatible]>; +def SVBSL2N_N : SInst<"svbsl2n[_n_{d}]", "ddda", "csilUcUsUiUl", MergeNone, "aarch64_sve_bsl2n", [IsStreamingCompatible]>; +def SVEOR3_N : SInst<"sveor3[_n_{d}]", "ddda", "csilUcUsUiUl", MergeNone, "aarch64_sve_eor3", 
[IsStreamingCompatible]>; +def SVNBSL_N : SInst<"svnbsl[_n_{d}]", "ddda", "csilUcUsUiUl", MergeNone, "aarch64_sve_nbsl", [IsStreamingCompatible]>; +def SVXAR_N : SInst<"svxar[_n_{d}]", "dddi", "csilUcUsUiUl", MergeNone, "aarch64_sve_xar", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRight, 1>]>; } //////////////////////////////////////////////////////////////////////////////// // SVE2 - Large integer arithmetic let TargetGuard = "sve2" in { -def SVADCLB : SInst<"svadclb[_{d}]", "dddd", "UiUl", MergeNone, "aarch64_sve_adclb">; -def SVADCLT : SInst<"svadclt[_{d}]", "dddd", "UiUl", MergeNone, "aarch64_sve_adclt">; -def SVSBCLB : SInst<"svsbclb[_{d}]", "dddd", "UiUl", MergeNone, "aarch64_sve_sbclb">; -def SVSBCLT : SInst<"svsbclt[_{d}]", "dddd", "UiUl", MergeNone, "aarch64_sve_sbclt">; +def SVADCLB : SInst<"svadclb[_{d}]", "dddd", "UiUl", MergeNone, "aarch64_sve_adclb", [IsStreamingCompatible]>; +def SVADCLT : SInst<"svadclt[_{d}]", "dddd", "UiUl", MergeNone, "aarch64_sve_adclt", [IsStreamingCompatible]>; +def SVSBCLB : SInst<"svsbclb[_{d}]", "dddd", "UiUl", MergeNone, "aarch64_sve_sbclb", [IsStreamingCompatible]>; +def SVSBCLT : SInst<"svsbclt[_{d}]", "dddd", "UiUl", MergeNone, "aarch64_sve_sbclt", [IsStreamingCompatible]>; -def SVADCLB_N : SInst<"svadclb[_n_{d}]", "ddda", "UiUl", MergeNone, "aarch64_sve_adclb">; -def SVADCLT_N : SInst<"svadclt[_n_{d}]", "ddda", "UiUl", MergeNone, "aarch64_sve_adclt">; -def SVSBCLB_N : SInst<"svsbclb[_n_{d}]", "ddda", "UiUl", MergeNone, "aarch64_sve_sbclb">; -def SVSBCLT_N : SInst<"svsbclt[_n_{d}]", "ddda", "UiUl", MergeNone, "aarch64_sve_sbclt">; +def SVADCLB_N : SInst<"svadclb[_n_{d}]", "ddda", "UiUl", MergeNone, "aarch64_sve_adclb", [IsStreamingCompatible]>; +def SVADCLT_N : SInst<"svadclt[_n_{d}]", "ddda", "UiUl", MergeNone, "aarch64_sve_adclt", [IsStreamingCompatible]>; +def SVSBCLB_N : SInst<"svsbclb[_n_{d}]", "ddda", "UiUl", MergeNone, "aarch64_sve_sbclb", [IsStreamingCompatible]>; +def SVSBCLT_N : SInst<"svsbclt[_n_{d}]", "ddda", "UiUl", MergeNone, "aarch64_sve_sbclt", [IsStreamingCompatible]>; } //////////////////////////////////////////////////////////////////////////////// // SVE2 - Multiplication by indexed elements let TargetGuard = "sve2" in { -def SVMLA_LANE_2 : SInst<"svmla_lane[_{d}]", "ddddi", "silUsUiUl", MergeNone, "aarch64_sve_mla_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; -def SVMLS_LANE_2 : SInst<"svmls_lane[_{d}]", "ddddi", "silUsUiUl", MergeNone, "aarch64_sve_mls_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; -def SVMUL_LANE_2 : SInst<"svmul_lane[_{d}]", "dddi", "silUsUiUl", MergeNone, "aarch64_sve_mul_lane", [], [ImmCheck<2, ImmCheckLaneIndex, 1>]>; +def SVMLA_LANE_2 : SInst<"svmla_lane[_{d}]", "ddddi", "silUsUiUl", MergeNone, "aarch64_sve_mla_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; +def SVMLS_LANE_2 : SInst<"svmls_lane[_{d}]", "ddddi", "silUsUiUl", MergeNone, "aarch64_sve_mls_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; +def SVMUL_LANE_2 : SInst<"svmul_lane[_{d}]", "dddi", "silUsUiUl", MergeNone, "aarch64_sve_mul_lane", [IsStreamingCompatible], [ImmCheck<2, ImmCheckLaneIndex, 1>]>; } //////////////////////////////////////////////////////////////////////////////// // SVE2 - Uniform complex integer arithmetic let TargetGuard = "sve2" in { -def SVCADD : SInst<"svcadd[_{d}]", "dddi", "csilUcUsUiUl", MergeNone, "aarch64_sve_cadd_x", [], [ImmCheck<2, ImmCheckComplexRot90_270>]>; -def SVSQCADD : SInst<"svqcadd[_{d}]", "dddi", "csil", MergeNone, "aarch64_sve_sqcadd_x", [], 
[ImmCheck<2, ImmCheckComplexRot90_270>]>; -def SVCMLA : SInst<"svcmla[_{d}]", "ddddi", "csilUcUsUiUl", MergeNone, "aarch64_sve_cmla_x", [], [ImmCheck<3, ImmCheckComplexRotAll90>]>; -def SVCMLA_LANE_X : SInst<"svcmla_lane[_{d}]", "ddddii", "siUsUi", MergeNone, "aarch64_sve_cmla_lane_x", [], [ImmCheck<3, ImmCheckLaneIndexCompRotate, 2>, +def SVCADD : SInst<"svcadd[_{d}]", "dddi", "csilUcUsUiUl", MergeNone, "aarch64_sve_cadd_x", [IsStreamingCompatible], [ImmCheck<2, ImmCheckComplexRot90_270>]>; +def SVSQCADD : SInst<"svqcadd[_{d}]", "dddi", "csil", MergeNone, "aarch64_sve_sqcadd_x", [IsStreamingCompatible], [ImmCheck<2, ImmCheckComplexRot90_270>]>; +def SVCMLA : SInst<"svcmla[_{d}]", "ddddi", "csilUcUsUiUl", MergeNone, "aarch64_sve_cmla_x", [IsStreamingCompatible], [ImmCheck<3, ImmCheckComplexRotAll90>]>; +def SVCMLA_LANE_X : SInst<"svcmla_lane[_{d}]", "ddddii", "siUsUi", MergeNone, "aarch64_sve_cmla_lane_x", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndexCompRotate, 2>, ImmCheck<4, ImmCheckComplexRotAll90>]>; -def SVSQRDCMLAH_X : SInst<"svqrdcmlah[_{d}]", "ddddi", "csil", MergeNone, "aarch64_sve_sqrdcmlah_x", [], [ImmCheck<3, ImmCheckComplexRotAll90>]>; -def SVSQRDCMLAH_LANE_X : SInst<"svqrdcmlah_lane[_{d}]", "ddddii", "si", MergeNone, "aarch64_sve_sqrdcmlah_lane_x", [], [ImmCheck<3, ImmCheckLaneIndexCompRotate, 2>, +def SVSQRDCMLAH_X : SInst<"svqrdcmlah[_{d}]", "ddddi", "csil", MergeNone, "aarch64_sve_sqrdcmlah_x", [IsStreamingCompatible], [ImmCheck<3, ImmCheckComplexRotAll90>]>; +def SVSQRDCMLAH_LANE_X : SInst<"svqrdcmlah_lane[_{d}]", "ddddii", "si", MergeNone, "aarch64_sve_sqrdcmlah_lane_x", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndexCompRotate, 2>, ImmCheck<4, ImmCheckComplexRotAll90>]>; } @@ -1691,18 +1505,18 @@ // SVE2 - Widening DSP operations multiclass SInstWideDSPAcc { - def : SInst; - def _N : SInst; + def : SInst; + def _N : SInst; } multiclass SInstWideDSPLong { - def : SInst; - def _N : SInst; + def : SInst; + def _N : SInst; } multiclass SInstWideDSPWide { - def : SInst; - def _N : SInst; + def : SInst; + def _N : SInst; } let TargetGuard = "sve2" in { @@ -1751,87 +1565,87 @@ defm SVSUBWT_S : SInstWideDSPWide<"svsubwt", "sil", "aarch64_sve_ssubwt">; defm SVSUBWT_U : SInstWideDSPWide<"svsubwt", "UsUiUl", "aarch64_sve_usubwt">; -def SVSHLLB_S_N : SInst<"svshllb[_n_{d}]", "dhi", "sil", MergeNone, "aarch64_sve_sshllb", [], [ImmCheck<1, ImmCheckShiftLeft, 0>]>; -def SVSHLLB_U_N : SInst<"svshllb[_n_{d}]", "dhi", "UsUiUl", MergeNone, "aarch64_sve_ushllb", [], [ImmCheck<1, ImmCheckShiftLeft, 0>]>; -def SVSHLLT_S_N : SInst<"svshllt[_n_{d}]", "dhi", "sil", MergeNone, "aarch64_sve_sshllt", [], [ImmCheck<1, ImmCheckShiftLeft, 0>]>; -def SVSHLLT_U_N : SInst<"svshllt[_n_{d}]", "dhi", "UsUiUl", MergeNone, "aarch64_sve_ushllt", [], [ImmCheck<1, ImmCheckShiftLeft, 0>]>; - -def SVMOVLB_S_N : SInst<"svmovlb[_{d}]", "dh", "sil", MergeNone>; -def SVMOVLB_U_N : SInst<"svmovlb[_{d}]", "dh", "UsUiUl", MergeNone>; -def SVMOVLT_S_N : SInst<"svmovlt[_{d}]", "dh", "sil", MergeNone>; -def SVMOVLT_U_N : SInst<"svmovlt[_{d}]", "dh", "UsUiUl", MergeNone>; - -def SVMLALB_S_LANE : SInst<"svmlalb_lane[_{d}]", "ddhhi", "il", MergeNone, "aarch64_sve_smlalb_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; -def SVMLALB_U_LANE : SInst<"svmlalb_lane[_{d}]", "ddhhi", "UiUl", MergeNone, "aarch64_sve_umlalb_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; -def SVMLALT_S_LANE : SInst<"svmlalt_lane[_{d}]", "ddhhi", "il", MergeNone, "aarch64_sve_smlalt_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; 
-def SVMLALT_U_LANE : SInst<"svmlalt_lane[_{d}]", "ddhhi", "UiUl", MergeNone, "aarch64_sve_umlalt_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; -def SVMLSLB_S_LANE : SInst<"svmlslb_lane[_{d}]", "ddhhi", "il", MergeNone, "aarch64_sve_smlslb_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; -def SVMLSLB_U_LANE : SInst<"svmlslb_lane[_{d}]", "ddhhi", "UiUl", MergeNone, "aarch64_sve_umlslb_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; -def SVMLSLT_S_LANE : SInst<"svmlslt_lane[_{d}]", "ddhhi", "il", MergeNone, "aarch64_sve_smlslt_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; -def SVMLSLT_U_LANE : SInst<"svmlslt_lane[_{d}]", "ddhhi", "UiUl", MergeNone, "aarch64_sve_umlslt_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; -def SVMULLB_S_LANE : SInst<"svmullb_lane[_{d}]", "dhhi", "il", MergeNone, "aarch64_sve_smullb_lane", [], [ImmCheck<2, ImmCheckLaneIndex, 1>]>; -def SVMULLB_U_LANE : SInst<"svmullb_lane[_{d}]", "dhhi", "UiUl", MergeNone, "aarch64_sve_umullb_lane", [], [ImmCheck<2, ImmCheckLaneIndex, 1>]>; -def SVMULLT_S_LANE : SInst<"svmullt_lane[_{d}]", "dhhi", "il", MergeNone, "aarch64_sve_smullt_lane", [], [ImmCheck<2, ImmCheckLaneIndex, 1>]>; -def SVMULLT_U_LANE : SInst<"svmullt_lane[_{d}]", "dhhi", "UiUl", MergeNone, "aarch64_sve_umullt_lane", [], [ImmCheck<2, ImmCheckLaneIndex, 1>]>; -def SVQDMLALB_LANE : SInst<"svqdmlalb_lane[_{d}]", "ddhhi", "il", MergeNone, "aarch64_sve_sqdmlalb_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; -def SVQDMLALT_LANE : SInst<"svqdmlalt_lane[_{d}]", "ddhhi", "il", MergeNone, "aarch64_sve_sqdmlalt_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; -def SVQDMLSLB_LANE : SInst<"svqdmlslb_lane[_{d}]", "ddhhi", "il", MergeNone, "aarch64_sve_sqdmlslb_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; -def SVQDMLSLT_LANE : SInst<"svqdmlslt_lane[_{d}]", "ddhhi", "il", MergeNone, "aarch64_sve_sqdmlslt_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; -def SVQDMULLB_LANE : SInst<"svqdmullb_lane[_{d}]", "dhhi", "il", MergeNone, "aarch64_sve_sqdmullb_lane", [], [ImmCheck<2, ImmCheckLaneIndex, 1>]>; -def SVQDMULLT_LANE : SInst<"svqdmullt_lane[_{d}]", "dhhi", "il", MergeNone, "aarch64_sve_sqdmullt_lane", [], [ImmCheck<2, ImmCheckLaneIndex, 1>]>; +def SVSHLLB_S_N : SInst<"svshllb[_n_{d}]", "dhi", "sil", MergeNone, "aarch64_sve_sshllb", [IsStreamingCompatible], [ImmCheck<1, ImmCheckShiftLeft, 0>]>; +def SVSHLLB_U_N : SInst<"svshllb[_n_{d}]", "dhi", "UsUiUl", MergeNone, "aarch64_sve_ushllb", [IsStreamingCompatible], [ImmCheck<1, ImmCheckShiftLeft, 0>]>; +def SVSHLLT_S_N : SInst<"svshllt[_n_{d}]", "dhi", "sil", MergeNone, "aarch64_sve_sshllt", [IsStreamingCompatible], [ImmCheck<1, ImmCheckShiftLeft, 0>]>; +def SVSHLLT_U_N : SInst<"svshllt[_n_{d}]", "dhi", "UsUiUl", MergeNone, "aarch64_sve_ushllt", [IsStreamingCompatible], [ImmCheck<1, ImmCheckShiftLeft, 0>]>; + +def SVMOVLB_S_N : SInst<"svmovlb[_{d}]", "dh", "sil", MergeNone, "", [IsStreamingCompatible]>; +def SVMOVLB_U_N : SInst<"svmovlb[_{d}]", "dh", "UsUiUl", MergeNone, "", [IsStreamingCompatible]>; +def SVMOVLT_S_N : SInst<"svmovlt[_{d}]", "dh", "sil", MergeNone, "", [IsStreamingCompatible]>; +def SVMOVLT_U_N : SInst<"svmovlt[_{d}]", "dh", "UsUiUl", MergeNone, "", [IsStreamingCompatible]>; + +def SVMLALB_S_LANE : SInst<"svmlalb_lane[_{d}]", "ddhhi", "il", MergeNone, "aarch64_sve_smlalb_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; +def SVMLALB_U_LANE : SInst<"svmlalb_lane[_{d}]", "ddhhi", "UiUl", MergeNone, "aarch64_sve_umlalb_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; 
+def SVMLALT_S_LANE : SInst<"svmlalt_lane[_{d}]", "ddhhi", "il", MergeNone, "aarch64_sve_smlalt_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; +def SVMLALT_U_LANE : SInst<"svmlalt_lane[_{d}]", "ddhhi", "UiUl", MergeNone, "aarch64_sve_umlalt_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; +def SVMLSLB_S_LANE : SInst<"svmlslb_lane[_{d}]", "ddhhi", "il", MergeNone, "aarch64_sve_smlslb_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; +def SVMLSLB_U_LANE : SInst<"svmlslb_lane[_{d}]", "ddhhi", "UiUl", MergeNone, "aarch64_sve_umlslb_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; +def SVMLSLT_S_LANE : SInst<"svmlslt_lane[_{d}]", "ddhhi", "il", MergeNone, "aarch64_sve_smlslt_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; +def SVMLSLT_U_LANE : SInst<"svmlslt_lane[_{d}]", "ddhhi", "UiUl", MergeNone, "aarch64_sve_umlslt_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; +def SVMULLB_S_LANE : SInst<"svmullb_lane[_{d}]", "dhhi", "il", MergeNone, "aarch64_sve_smullb_lane", [IsStreamingCompatible], [ImmCheck<2, ImmCheckLaneIndex, 1>]>; +def SVMULLB_U_LANE : SInst<"svmullb_lane[_{d}]", "dhhi", "UiUl", MergeNone, "aarch64_sve_umullb_lane", [IsStreamingCompatible], [ImmCheck<2, ImmCheckLaneIndex, 1>]>; +def SVMULLT_S_LANE : SInst<"svmullt_lane[_{d}]", "dhhi", "il", MergeNone, "aarch64_sve_smullt_lane", [IsStreamingCompatible], [ImmCheck<2, ImmCheckLaneIndex, 1>]>; +def SVMULLT_U_LANE : SInst<"svmullt_lane[_{d}]", "dhhi", "UiUl", MergeNone, "aarch64_sve_umullt_lane", [IsStreamingCompatible], [ImmCheck<2, ImmCheckLaneIndex, 1>]>; +def SVQDMLALB_LANE : SInst<"svqdmlalb_lane[_{d}]", "ddhhi", "il", MergeNone, "aarch64_sve_sqdmlalb_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; +def SVQDMLALT_LANE : SInst<"svqdmlalt_lane[_{d}]", "ddhhi", "il", MergeNone, "aarch64_sve_sqdmlalt_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; +def SVQDMLSLB_LANE : SInst<"svqdmlslb_lane[_{d}]", "ddhhi", "il", MergeNone, "aarch64_sve_sqdmlslb_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; +def SVQDMLSLT_LANE : SInst<"svqdmlslt_lane[_{d}]", "ddhhi", "il", MergeNone, "aarch64_sve_sqdmlslt_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; +def SVQDMULLB_LANE : SInst<"svqdmullb_lane[_{d}]", "dhhi", "il", MergeNone, "aarch64_sve_sqdmullb_lane", [IsStreamingCompatible], [ImmCheck<2, ImmCheckLaneIndex, 1>]>; +def SVQDMULLT_LANE : SInst<"svqdmullt_lane[_{d}]", "dhhi", "il", MergeNone, "aarch64_sve_sqdmullt_lane", [IsStreamingCompatible], [ImmCheck<2, ImmCheckLaneIndex, 1>]>; } //////////////////////////////////////////////////////////////////////////////// // SVE2 - Narrowing DSP operations let TargetGuard = "sve2" in { -def SVADDHNB : SInst<"svaddhnb[_{d}]", "hdd", "silUsUiUl", MergeNone, "aarch64_sve_addhnb">; -def SVADDHNT : SInst<"svaddhnt[_{d}]", "hhdd", "silUsUiUl", MergeNone, "aarch64_sve_addhnt">; -def SVRADDHNB : SInst<"svraddhnb[_{d}]", "hdd", "silUsUiUl", MergeNone, "aarch64_sve_raddhnb">; -def SVRADDHNT : SInst<"svraddhnt[_{d}]", "hhdd", "silUsUiUl", MergeNone, "aarch64_sve_raddhnt">; -def SVRSUBHNB : SInst<"svrsubhnb[_{d}]", "hdd", "silUsUiUl", MergeNone, "aarch64_sve_rsubhnb">; -def SVRSUBHNT : SInst<"svrsubhnt[_{d}]", "hhdd", "silUsUiUl", MergeNone, "aarch64_sve_rsubhnt">; -def SVSUBHNB : SInst<"svsubhnb[_{d}]", "hdd", "silUsUiUl", MergeNone, "aarch64_sve_subhnb">; -def SVSUBHNT : SInst<"svsubhnt[_{d}]", "hhdd", 
"silUsUiUl", MergeNone, "aarch64_sve_subhnt">; - -def SVADDHNB_N : SInst<"svaddhnb[_n_{d}]", "hda", "silUsUiUl", MergeNone, "aarch64_sve_addhnb">; -def SVADDHNT_N : SInst<"svaddhnt[_n_{d}]", "hhda", "silUsUiUl", MergeNone, "aarch64_sve_addhnt">; -def SVRADDHNB_N : SInst<"svraddhnb[_n_{d}]", "hda", "silUsUiUl", MergeNone, "aarch64_sve_raddhnb">; -def SVRADDHNT_N : SInst<"svraddhnt[_n_{d}]", "hhda", "silUsUiUl", MergeNone, "aarch64_sve_raddhnt">; -def SVRSUBHNB_N : SInst<"svrsubhnb[_n_{d}]", "hda", "silUsUiUl", MergeNone, "aarch64_sve_rsubhnb">; -def SVRSUBHNT_N : SInst<"svrsubhnt[_n_{d}]", "hhda", "silUsUiUl", MergeNone, "aarch64_sve_rsubhnt">; -def SVSUBHNB_N : SInst<"svsubhnb[_n_{d}]", "hda", "silUsUiUl", MergeNone, "aarch64_sve_subhnb">; -def SVSUBHNT_N : SInst<"svsubhnt[_n_{d}]", "hhda", "silUsUiUl", MergeNone, "aarch64_sve_subhnt">; - -def SVSHRNB : SInst<"svshrnb[_n_{d}]", "hdi", "silUsUiUl", MergeNone, "aarch64_sve_shrnb", [], [ImmCheck<1, ImmCheckShiftRightNarrow, 0>]>; -def SVRSHRNB : SInst<"svrshrnb[_n_{d}]", "hdi", "silUsUiUl", MergeNone, "aarch64_sve_rshrnb", [], [ImmCheck<1, ImmCheckShiftRightNarrow, 0>]>; -def SVQSHRUNB : SInst<"svqshrunb[_n_{d}]", "edi", "sil", MergeNone, "aarch64_sve_sqshrunb", [], [ImmCheck<1, ImmCheckShiftRightNarrow, 0>]>; -def SVQRSHRUNB : SInst<"svqrshrunb[_n_{d}]", "edi", "sil", MergeNone, "aarch64_sve_sqrshrunb", [], [ImmCheck<1, ImmCheckShiftRightNarrow, 0>]>; -def SVQSHRNB_S : SInst<"svqshrnb[_n_{d}]", "hdi", "sil", MergeNone, "aarch64_sve_sqshrnb", [], [ImmCheck<1, ImmCheckShiftRightNarrow, 0>]>; -def SVQSHRNB_U : SInst<"svqshrnb[_n_{d}]", "hdi", "UsUiUl", MergeNone, "aarch64_sve_uqshrnb", [], [ImmCheck<1, ImmCheckShiftRightNarrow, 0>]>; -def SVQRSHRNB_S : SInst<"svqrshrnb[_n_{d}]", "hdi", "sil", MergeNone, "aarch64_sve_sqrshrnb", [], [ImmCheck<1, ImmCheckShiftRightNarrow, 0>]>; -def SVQRSHRNB_U : SInst<"svqrshrnb[_n_{d}]", "hdi", "UsUiUl", MergeNone, "aarch64_sve_uqrshrnb", [], [ImmCheck<1, ImmCheckShiftRightNarrow, 0>]>; - -def SVSHRNT : SInst<"svshrnt[_n_{d}]", "hhdi", "silUsUiUl", MergeNone, "aarch64_sve_shrnt", [], [ImmCheck<2, ImmCheckShiftRightNarrow, 1>]>; -def SVRSHRNT : SInst<"svrshrnt[_n_{d}]", "hhdi", "silUsUiUl", MergeNone, "aarch64_sve_rshrnt", [], [ImmCheck<2, ImmCheckShiftRightNarrow, 1>]>; -def SVQSHRUNT : SInst<"svqshrunt[_n_{d}]", "eedi", "sil", MergeNone, "aarch64_sve_sqshrunt", [], [ImmCheck<2, ImmCheckShiftRightNarrow, 1>]>; -def SVQRSHRUNT : SInst<"svqrshrunt[_n_{d}]", "eedi", "sil", MergeNone, "aarch64_sve_sqrshrunt", [], [ImmCheck<2, ImmCheckShiftRightNarrow, 1>]>; -def SVQSHRNT_S : SInst<"svqshrnt[_n_{d}]", "hhdi", "sil", MergeNone, "aarch64_sve_sqshrnt", [], [ImmCheck<2, ImmCheckShiftRightNarrow, 1>]>; -def SVQSHRNT_U : SInst<"svqshrnt[_n_{d}]", "hhdi", "UsUiUl", MergeNone, "aarch64_sve_uqshrnt", [], [ImmCheck<2, ImmCheckShiftRightNarrow, 1>]>; -def SVQRSHRNT_S : SInst<"svqrshrnt[_n_{d}]", "hhdi", "sil", MergeNone, "aarch64_sve_sqrshrnt", [], [ImmCheck<2, ImmCheckShiftRightNarrow, 1>]>; -def SVQRSHRNT_U : SInst<"svqrshrnt[_n_{d}]", "hhdi", "UsUiUl", MergeNone, "aarch64_sve_uqrshrnt", [], [ImmCheck<2, ImmCheckShiftRightNarrow, 1>]>; +def SVADDHNB : SInst<"svaddhnb[_{d}]", "hdd", "silUsUiUl", MergeNone, "aarch64_sve_addhnb", [IsStreamingCompatible]>; +def SVADDHNT : SInst<"svaddhnt[_{d}]", "hhdd", "silUsUiUl", MergeNone, "aarch64_sve_addhnt", [IsStreamingCompatible]>; +def SVRADDHNB : SInst<"svraddhnb[_{d}]", "hdd", "silUsUiUl", MergeNone, "aarch64_sve_raddhnb", [IsStreamingCompatible]>; +def SVRADDHNT : 
SInst<"svraddhnt[_{d}]", "hhdd", "silUsUiUl", MergeNone, "aarch64_sve_raddhnt", [IsStreamingCompatible]>; +def SVRSUBHNB : SInst<"svrsubhnb[_{d}]", "hdd", "silUsUiUl", MergeNone, "aarch64_sve_rsubhnb", [IsStreamingCompatible]>; +def SVRSUBHNT : SInst<"svrsubhnt[_{d}]", "hhdd", "silUsUiUl", MergeNone, "aarch64_sve_rsubhnt", [IsStreamingCompatible]>; +def SVSUBHNB : SInst<"svsubhnb[_{d}]", "hdd", "silUsUiUl", MergeNone, "aarch64_sve_subhnb", [IsStreamingCompatible]>; +def SVSUBHNT : SInst<"svsubhnt[_{d}]", "hhdd", "silUsUiUl", MergeNone, "aarch64_sve_subhnt", [IsStreamingCompatible]>; + +def SVADDHNB_N : SInst<"svaddhnb[_n_{d}]", "hda", "silUsUiUl", MergeNone, "aarch64_sve_addhnb", [IsStreamingCompatible]>; +def SVADDHNT_N : SInst<"svaddhnt[_n_{d}]", "hhda", "silUsUiUl", MergeNone, "aarch64_sve_addhnt", [IsStreamingCompatible]>; +def SVRADDHNB_N : SInst<"svraddhnb[_n_{d}]", "hda", "silUsUiUl", MergeNone, "aarch64_sve_raddhnb", [IsStreamingCompatible]>; +def SVRADDHNT_N : SInst<"svraddhnt[_n_{d}]", "hhda", "silUsUiUl", MergeNone, "aarch64_sve_raddhnt", [IsStreamingCompatible]>; +def SVRSUBHNB_N : SInst<"svrsubhnb[_n_{d}]", "hda", "silUsUiUl", MergeNone, "aarch64_sve_rsubhnb", [IsStreamingCompatible]>; +def SVRSUBHNT_N : SInst<"svrsubhnt[_n_{d}]", "hhda", "silUsUiUl", MergeNone, "aarch64_sve_rsubhnt", [IsStreamingCompatible]>; +def SVSUBHNB_N : SInst<"svsubhnb[_n_{d}]", "hda", "silUsUiUl", MergeNone, "aarch64_sve_subhnb", [IsStreamingCompatible]>; +def SVSUBHNT_N : SInst<"svsubhnt[_n_{d}]", "hhda", "silUsUiUl", MergeNone, "aarch64_sve_subhnt", [IsStreamingCompatible]>; + +def SVSHRNB : SInst<"svshrnb[_n_{d}]", "hdi", "silUsUiUl", MergeNone, "aarch64_sve_shrnb", [IsStreamingCompatible], [ImmCheck<1, ImmCheckShiftRightNarrow, 0>]>; +def SVRSHRNB : SInst<"svrshrnb[_n_{d}]", "hdi", "silUsUiUl", MergeNone, "aarch64_sve_rshrnb", [IsStreamingCompatible], [ImmCheck<1, ImmCheckShiftRightNarrow, 0>]>; +def SVQSHRUNB : SInst<"svqshrunb[_n_{d}]", "edi", "sil", MergeNone, "aarch64_sve_sqshrunb", [IsStreamingCompatible], [ImmCheck<1, ImmCheckShiftRightNarrow, 0>]>; +def SVQRSHRUNB : SInst<"svqrshrunb[_n_{d}]", "edi", "sil", MergeNone, "aarch64_sve_sqrshrunb", [IsStreamingCompatible], [ImmCheck<1, ImmCheckShiftRightNarrow, 0>]>; +def SVQSHRNB_S : SInst<"svqshrnb[_n_{d}]", "hdi", "sil", MergeNone, "aarch64_sve_sqshrnb", [IsStreamingCompatible], [ImmCheck<1, ImmCheckShiftRightNarrow, 0>]>; +def SVQSHRNB_U : SInst<"svqshrnb[_n_{d}]", "hdi", "UsUiUl", MergeNone, "aarch64_sve_uqshrnb", [IsStreamingCompatible], [ImmCheck<1, ImmCheckShiftRightNarrow, 0>]>; +def SVQRSHRNB_S : SInst<"svqrshrnb[_n_{d}]", "hdi", "sil", MergeNone, "aarch64_sve_sqrshrnb", [IsStreamingCompatible], [ImmCheck<1, ImmCheckShiftRightNarrow, 0>]>; +def SVQRSHRNB_U : SInst<"svqrshrnb[_n_{d}]", "hdi", "UsUiUl", MergeNone, "aarch64_sve_uqrshrnb", [IsStreamingCompatible], [ImmCheck<1, ImmCheckShiftRightNarrow, 0>]>; + +def SVSHRNT : SInst<"svshrnt[_n_{d}]", "hhdi", "silUsUiUl", MergeNone, "aarch64_sve_shrnt", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRightNarrow, 1>]>; +def SVRSHRNT : SInst<"svrshrnt[_n_{d}]", "hhdi", "silUsUiUl", MergeNone, "aarch64_sve_rshrnt", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRightNarrow, 1>]>; +def SVQSHRUNT : SInst<"svqshrunt[_n_{d}]", "eedi", "sil", MergeNone, "aarch64_sve_sqshrunt", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRightNarrow, 1>]>; +def SVQRSHRUNT : SInst<"svqrshrunt[_n_{d}]", "eedi", "sil", MergeNone, "aarch64_sve_sqrshrunt", [IsStreamingCompatible], [ImmCheck<2, 
ImmCheckShiftRightNarrow, 1>]>; +def SVQSHRNT_S : SInst<"svqshrnt[_n_{d}]", "hhdi", "sil", MergeNone, "aarch64_sve_sqshrnt", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRightNarrow, 1>]>; +def SVQSHRNT_U : SInst<"svqshrnt[_n_{d}]", "hhdi", "UsUiUl", MergeNone, "aarch64_sve_uqshrnt", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRightNarrow, 1>]>; +def SVQRSHRNT_S : SInst<"svqrshrnt[_n_{d}]", "hhdi", "sil", MergeNone, "aarch64_sve_sqrshrnt", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRightNarrow, 1>]>; +def SVQRSHRNT_U : SInst<"svqrshrnt[_n_{d}]", "hhdi", "UsUiUl", MergeNone, "aarch64_sve_uqrshrnt", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRightNarrow, 1>]>; } //////////////////////////////////////////////////////////////////////////////// // SVE2 - Unary narrowing operations let TargetGuard = "sve2" in { -def SVQXTNB_S : SInst<"svqxtnb[_{d}]", "hd", "sil", MergeNone, "aarch64_sve_sqxtnb">; -def SVQXTNB_U : SInst<"svqxtnb[_{d}]", "hd", "UsUiUl", MergeNone, "aarch64_sve_uqxtnb">; -def SVQXTUNB_S : SInst<"svqxtunb[_{d}]", "ed", "sil", MergeNone, "aarch64_sve_sqxtunb">; +def SVQXTNB_S : SInst<"svqxtnb[_{d}]", "hd", "sil", MergeNone, "aarch64_sve_sqxtnb", [IsStreamingCompatible]>; +def SVQXTNB_U : SInst<"svqxtnb[_{d}]", "hd", "UsUiUl", MergeNone, "aarch64_sve_uqxtnb", [IsStreamingCompatible]>; +def SVQXTUNB_S : SInst<"svqxtunb[_{d}]", "ed", "sil", MergeNone, "aarch64_sve_sqxtunb", [IsStreamingCompatible]>; -def SVQXTNT_S : SInst<"svqxtnt[_{d}]", "hhd", "sil", MergeNone, "aarch64_sve_sqxtnt">; -def SVQXTNT_U : SInst<"svqxtnt[_{d}]", "hhd", "UsUiUl", MergeNone, "aarch64_sve_uqxtnt">; -def SVQXTUNT_S : SInst<"svqxtunt[_{d}]", "eed", "sil", MergeNone, "aarch64_sve_sqxtunt">; +def SVQXTNT_S : SInst<"svqxtnt[_{d}]", "hhd", "sil", MergeNone, "aarch64_sve_sqxtnt", [IsStreamingCompatible]>; +def SVQXTNT_U : SInst<"svqxtnt[_{d}]", "hhd", "UsUiUl", MergeNone, "aarch64_sve_uqxtnt", [IsStreamingCompatible]>; +def SVQXTUNT_S : SInst<"svqxtunt[_{d}]", "eed", "sil", MergeNone, "aarch64_sve_sqxtunt", [IsStreamingCompatible]>; } //////////////////////////////////////////////////////////////////////////////// @@ -1972,28 +1786,28 @@ // SVE2 - Polynomial arithmetic let TargetGuard = "sve2" in { -def SVEORBT : SInst<"sveorbt[_{d}]", "dddd", "csilUcUsUiUl", MergeNone, "aarch64_sve_eorbt">; -def SVEORBT_N : SInst<"sveorbt[_n_{d}]", "ddda", "csilUcUsUiUl", MergeNone, "aarch64_sve_eorbt">; -def SVEORTB : SInst<"sveortb[_{d}]", "dddd", "csilUcUsUiUl", MergeNone, "aarch64_sve_eortb">; -def SVEORTB_N : SInst<"sveortb[_n_{d}]", "ddda", "csilUcUsUiUl", MergeNone, "aarch64_sve_eortb">; -def SVPMUL : SInst<"svpmul[_{d}]", "ddd", "Uc", MergeNone, "aarch64_sve_pmul">; -def SVPMUL_N : SInst<"svpmul[_n_{d}]", "dda", "Uc", MergeNone, "aarch64_sve_pmul">; -def SVPMULLB : SInst<"svpmullb[_{d}]", "dhh", "UsUl", MergeNone>; -def SVPMULLB_N : SInst<"svpmullb[_n_{d}]", "dhR", "UsUl", MergeNone>; -def SVPMULLB_PAIR : SInst<"svpmullb_pair[_{d}]", "ddd", "UcUi", MergeNone, "aarch64_sve_pmullb_pair">; -def SVPMULLB_PAIR_N : SInst<"svpmullb_pair[_n_{d}]", "dda", "UcUi", MergeNone, "aarch64_sve_pmullb_pair">; -def SVPMULLT : SInst<"svpmullt[_{d}]", "dhh", "UsUl", MergeNone>; -def SVPMULLT_N : SInst<"svpmullt[_n_{d}]", "dhR", "UsUl", MergeNone>; -def SVPMULLT_PAIR : SInst<"svpmullt_pair[_{d}]", "ddd", "UcUi", MergeNone, "aarch64_sve_pmullt_pair">; -def SVPMULLT_PAIR_N : SInst<"svpmullt_pair[_n_{d}]", "dda", "UcUi", MergeNone, "aarch64_sve_pmullt_pair">; +def SVEORBT : SInst<"sveorbt[_{d}]", "dddd", "csilUcUsUiUl", 
MergeNone, "aarch64_sve_eorbt", [IsStreamingCompatible]>; +def SVEORBT_N : SInst<"sveorbt[_n_{d}]", "ddda", "csilUcUsUiUl", MergeNone, "aarch64_sve_eorbt", [IsStreamingCompatible]>; +def SVEORTB : SInst<"sveortb[_{d}]", "dddd", "csilUcUsUiUl", MergeNone, "aarch64_sve_eortb", [IsStreamingCompatible]>; +def SVEORTB_N : SInst<"sveortb[_n_{d}]", "ddda", "csilUcUsUiUl", MergeNone, "aarch64_sve_eortb", [IsStreamingCompatible]>; +def SVPMUL : SInst<"svpmul[_{d}]", "ddd", "Uc", MergeNone, "aarch64_sve_pmul", [IsStreamingCompatible]>; +def SVPMUL_N : SInst<"svpmul[_n_{d}]", "dda", "Uc", MergeNone, "aarch64_sve_pmul", [IsStreamingCompatible]>; +def SVPMULLB : SInst<"svpmullb[_{d}]", "dhh", "UsUl", MergeNone, "", [IsStreamingCompatible]>; +def SVPMULLB_N : SInst<"svpmullb[_n_{d}]", "dhR", "UsUl", MergeNone, "", [IsStreamingCompatible]>; +def SVPMULLB_PAIR : SInst<"svpmullb_pair[_{d}]", "ddd", "UcUi", MergeNone, "aarch64_sve_pmullb_pair", [IsStreamingCompatible]>; +def SVPMULLB_PAIR_N : SInst<"svpmullb_pair[_n_{d}]", "dda", "UcUi", MergeNone, "aarch64_sve_pmullb_pair", [IsStreamingCompatible]>; +def SVPMULLT : SInst<"svpmullt[_{d}]", "dhh", "UsUl", MergeNone, "", [IsStreamingCompatible]>; +def SVPMULLT_N : SInst<"svpmullt[_n_{d}]", "dhR", "UsUl", MergeNone, "", [IsStreamingCompatible]>; +def SVPMULLT_PAIR : SInst<"svpmullt_pair[_{d}]", "ddd", "UcUi", MergeNone, "aarch64_sve_pmullt_pair", [IsStreamingCompatible]>; +def SVPMULLT_PAIR_N : SInst<"svpmullt_pair[_n_{d}]", "dda", "UcUi", MergeNone, "aarch64_sve_pmullt_pair", [IsStreamingCompatible]>; } //////////////////////////////////////////////////////////////////////////////// // SVE2 - Complex integer dot product let TargetGuard = "sve2" in { -def SVCDOT : SInst<"svcdot[_{d}]", "ddqqi", "il", MergeNone, "aarch64_sve_cdot", [], [ImmCheck<3, ImmCheckComplexRotAll90>]>; -def SVCDOT_LANE : SInst<"svcdot_lane[_{d}]", "ddqqii", "il", MergeNone, "aarch64_sve_cdot_lane", [], [ImmCheck<4, ImmCheckComplexRotAll90>, +def SVCDOT : SInst<"svcdot[_{d}]", "ddqqi", "il", MergeNone, "aarch64_sve_cdot", [IsStreamingCompatible], [ImmCheck<3, ImmCheckComplexRotAll90>]>; +def SVCDOT_LANE : SInst<"svcdot_lane[_{d}]", "ddqqii", "il", MergeNone, "aarch64_sve_cdot_lane", [IsStreamingCompatible], [ImmCheck<4, ImmCheckComplexRotAll90>, ImmCheck<3, ImmCheckLaneIndexDot, 2>]>; } @@ -2001,27 +1815,27 @@ // SVE2 - Floating-point widening multiply-accumulate let TargetGuard = "sve2" in { -def SVMLALB_F : SInst<"svmlalb[_{d}]", "ddhh", "f", MergeNone, "aarch64_sve_fmlalb">; -def SVMLALB_F_N : SInst<"svmlalb[_n_{d}]", "ddhR", "f", MergeNone, "aarch64_sve_fmlalb">; -def SVMLALB_F_LANE : SInst<"svmlalb_lane[_{d}]", "ddhhi", "f", MergeNone, "aarch64_sve_fmlalb_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; -def SVMLALT_F : SInst<"svmlalt[_{d}]", "ddhh", "f", MergeNone, "aarch64_sve_fmlalt">; -def SVMLALT_F_N : SInst<"svmlalt[_n_{d}]", "ddhR", "f", MergeNone, "aarch64_sve_fmlalt">; -def SVMLALT_F_LANE : SInst<"svmlalt_lane[_{d}]", "ddhhi", "f", MergeNone, "aarch64_sve_fmlalt_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; -def SVMLSLB_F : SInst<"svmlslb[_{d}]", "ddhh", "f", MergeNone, "aarch64_sve_fmlslb">; -def SVMLSLB_F_N : SInst<"svmlslb[_n_{d}]", "ddhR", "f", MergeNone, "aarch64_sve_fmlslb">; -def SVMLSLB_F_LANE : SInst<"svmlslb_lane[_{d}]", "ddhhi", "f", MergeNone, "aarch64_sve_fmlslb_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; -def SVMLSLT_F : SInst<"svmlslt[_{d}]", "ddhh", "f", MergeNone, "aarch64_sve_fmlslt">; -def SVMLSLT_F_N : SInst<"svmlslt[_n_{d}]", "ddhR", "f", 
MergeNone, "aarch64_sve_fmlslt">; -def SVMLSLT_F_LANE : SInst<"svmlslt_lane[_{d}]", "ddhhi", "f", MergeNone, "aarch64_sve_fmlslt_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; +def SVMLALB_F : SInst<"svmlalb[_{d}]", "ddhh", "f", MergeNone, "aarch64_sve_fmlalb", [IsStreamingCompatible]>; +def SVMLALB_F_N : SInst<"svmlalb[_n_{d}]", "ddhR", "f", MergeNone, "aarch64_sve_fmlalb", [IsStreamingCompatible]>; +def SVMLALB_F_LANE : SInst<"svmlalb_lane[_{d}]", "ddhhi", "f", MergeNone, "aarch64_sve_fmlalb_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; +def SVMLALT_F : SInst<"svmlalt[_{d}]", "ddhh", "f", MergeNone, "aarch64_sve_fmlalt", [IsStreamingCompatible]>; +def SVMLALT_F_N : SInst<"svmlalt[_n_{d}]", "ddhR", "f", MergeNone, "aarch64_sve_fmlalt", [IsStreamingCompatible]>; +def SVMLALT_F_LANE : SInst<"svmlalt_lane[_{d}]", "ddhhi", "f", MergeNone, "aarch64_sve_fmlalt_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; +def SVMLSLB_F : SInst<"svmlslb[_{d}]", "ddhh", "f", MergeNone, "aarch64_sve_fmlslb", [IsStreamingCompatible]>; +def SVMLSLB_F_N : SInst<"svmlslb[_n_{d}]", "ddhR", "f", MergeNone, "aarch64_sve_fmlslb", [IsStreamingCompatible]>; +def SVMLSLB_F_LANE : SInst<"svmlslb_lane[_{d}]", "ddhhi", "f", MergeNone, "aarch64_sve_fmlslb_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; +def SVMLSLT_F : SInst<"svmlslt[_{d}]", "ddhh", "f", MergeNone, "aarch64_sve_fmlslt", [IsStreamingCompatible]>; +def SVMLSLT_F_N : SInst<"svmlslt[_n_{d}]", "ddhR", "f", MergeNone, "aarch64_sve_fmlslt", [IsStreamingCompatible]>; +def SVMLSLT_F_LANE : SInst<"svmlslt_lane[_{d}]", "ddhhi", "f", MergeNone, "aarch64_sve_fmlslt_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; } //////////////////////////////////////////////////////////////////////////////// // SVE2 - Floating-point integer binary logarithm let TargetGuard = "sve2" in { -def SVLOGB_M : SInst<"svlogb[_{d}]", "xxPd", "hfd", MergeOp1, "aarch64_sve_flogb">; -def SVLOGB_X : SInst<"svlogb[_{d}]", "xPd", "hfd", MergeAnyExp, "aarch64_sve_flogb">; -def SVLOGB_Z : SInst<"svlogb[_{d}]", "xPd", "hfd", MergeZeroExp, "aarch64_sve_flogb">; +def SVLOGB_M : SInst<"svlogb[_{d}]", "xxPd", "hfd", MergeOp1, "aarch64_sve_flogb", [IsStreamingCompatible]>; +def SVLOGB_X : SInst<"svlogb[_{d}]", "xPd", "hfd", MergeAnyExp, "aarch64_sve_flogb", [IsStreamingCompatible]>; +def SVLOGB_Z : SInst<"svlogb[_{d}]", "xPd", "hfd", MergeZeroExp, "aarch64_sve_flogb", [IsStreamingCompatible]>; } //////////////////////////////////////////////////////////////////////////////// @@ -2043,32 +1857,32 @@ //////////////////////////////////////////////////////////////////////////////// // SVE2 - Contiguous conflict detection let TargetGuard = "sve2" in { -def SVWHILERW_B : SInst<"svwhilerw[_{1}]", "Pcc", "cUc", MergeNone, "aarch64_sve_whilerw_b", [IsOverloadWhileRW]>; -def SVWHILERW_H : SInst<"svwhilerw[_{1}]", "Pcc", "sUsh", MergeNone, "aarch64_sve_whilerw_h", [IsOverloadWhileRW]>; -def SVWHILERW_S : SInst<"svwhilerw[_{1}]", "Pcc", "iUif", MergeNone, "aarch64_sve_whilerw_s", [IsOverloadWhileRW]>; -def SVWHILERW_D : SInst<"svwhilerw[_{1}]", "Pcc", "lUld", MergeNone, "aarch64_sve_whilerw_d", [IsOverloadWhileRW]>; +def SVWHILERW_B : SInst<"svwhilerw[_{1}]", "Pcc", "cUc", MergeNone, "aarch64_sve_whilerw_b", [IsOverloadWhileRW, IsStreamingCompatible]>; +def SVWHILERW_H : SInst<"svwhilerw[_{1}]", "Pcc", "sUsh", MergeNone, "aarch64_sve_whilerw_h", [IsOverloadWhileRW, IsStreamingCompatible]>; +def SVWHILERW_S : 
SInst<"svwhilerw[_{1}]", "Pcc", "iUif", MergeNone, "aarch64_sve_whilerw_s", [IsOverloadWhileRW, IsStreamingCompatible]>; +def SVWHILERW_D : SInst<"svwhilerw[_{1}]", "Pcc", "lUld", MergeNone, "aarch64_sve_whilerw_d", [IsOverloadWhileRW, IsStreamingCompatible]>; -def SVWHILEWR_B : SInst<"svwhilewr[_{1}]", "Pcc", "cUc", MergeNone, "aarch64_sve_whilewr_b", [IsOverloadWhileRW]>; -def SVWHILEWR_H : SInst<"svwhilewr[_{1}]", "Pcc", "sUsh", MergeNone, "aarch64_sve_whilewr_h", [IsOverloadWhileRW]>; -def SVWHILEWR_S : SInst<"svwhilewr[_{1}]", "Pcc", "iUif", MergeNone, "aarch64_sve_whilewr_s", [IsOverloadWhileRW]>; -def SVWHILEWR_D : SInst<"svwhilewr[_{1}]", "Pcc", "lUld", MergeNone, "aarch64_sve_whilewr_d", [IsOverloadWhileRW]>; +def SVWHILEWR_B : SInst<"svwhilewr[_{1}]", "Pcc", "cUc", MergeNone, "aarch64_sve_whilewr_b", [IsOverloadWhileRW, IsStreamingCompatible]>; +def SVWHILEWR_H : SInst<"svwhilewr[_{1}]", "Pcc", "sUsh", MergeNone, "aarch64_sve_whilewr_h", [IsOverloadWhileRW, IsStreamingCompatible]>; +def SVWHILEWR_S : SInst<"svwhilewr[_{1}]", "Pcc", "iUif", MergeNone, "aarch64_sve_whilewr_s", [IsOverloadWhileRW, IsStreamingCompatible]>; +def SVWHILEWR_D : SInst<"svwhilewr[_{1}]", "Pcc", "lUld", MergeNone, "aarch64_sve_whilewr_d", [IsOverloadWhileRW, IsStreamingCompatible]>; } let TargetGuard = "sve2,bf16" in { -def SVWHILERW_H_BF16 : SInst<"svwhilerw[_{1}]", "Pcc", "b", MergeNone, "aarch64_sve_whilerw_h", [IsOverloadWhileRW]>; -def SVWHILEWR_H_BF16 : SInst<"svwhilewr[_{1}]", "Pcc", "b", MergeNone, "aarch64_sve_whilewr_h", [IsOverloadWhileRW]>; +def SVWHILERW_H_BF16 : SInst<"svwhilerw[_{1}]", "Pcc", "b", MergeNone, "aarch64_sve_whilerw_h", [IsOverloadWhileRW, IsStreamingCompatible]>; +def SVWHILEWR_H_BF16 : SInst<"svwhilewr[_{1}]", "Pcc", "b", MergeNone, "aarch64_sve_whilewr_h", [IsOverloadWhileRW, IsStreamingCompatible]>; } //////////////////////////////////////////////////////////////////////////////// // SVE2 - Extended table lookup/permute let TargetGuard = "sve2" in { -def SVTBL2 : SInst<"svtbl2[_{d}]", "d2u", "csilUcUsUiUlhfd", MergeNone>; -def SVTBX : SInst<"svtbx[_{d}]", "dddu", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_tbx">; +def SVTBL2 : SInst<"svtbl2[_{d}]", "d2u", "csilUcUsUiUlhfd", MergeNone, "", [IsStreamingCompatible]>; +def SVTBX : SInst<"svtbx[_{d}]", "dddu", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_tbx", [IsStreamingCompatible]>; } let TargetGuard = "sve2,bf16" in { -def SVTBL2_BF16 : SInst<"svtbl2[_{d}]", "d2u", "b", MergeNone>; -def SVTBX_BF16 : SInst<"svtbx[_{d}]", "dddu", "b", MergeNone, "aarch64_sve_tbx">; +def SVTBL2_BF16 : SInst<"svtbl2[_{d}]", "d2u", "b", MergeNone, "", [IsStreamingCompatible]>; +def SVTBX_BF16 : SInst<"svtbx[_{d}]", "dddu", "b", MergeNone, "aarch64_sve_tbx", [IsStreamingCompatible]>; } //////////////////////////////////////////////////////////////////////////////// @@ -2104,3 +1918,337 @@ def SVBGRP : SInst<"svbgrp[_{d}]", "ddd", "UcUsUiUl", MergeNone, "aarch64_sve_bgrp_x">; def SVBGRP_N : SInst<"svbgrp[_n_{d}]", "dda", "UcUsUiUl", MergeNone, "aarch64_sve_bgrp_x">; } + +//////////////////////////////////////////////////////////////////////////////// +// SME + +let TargetGuard = "sme" in { +def SVSCLAMP : SInst<"svclamp[_{d}]", "dddd", "csil", MergeNone, "aarch64_sve_sclamp", [IsStreamingCompatible], []>; +def SVUCLAMP : SInst<"svclamp[_{d}]", "dddd", "UcUsUiUl", MergeNone, "aarch64_sve_uclamp", [IsStreamingCompatible], []>; +defm SVREVD : SInstZPZ<"svrevd", "csilUcUsUiUl", "aarch64_sve_revd">; + +def SVPSEL_B : SInst<"svpsel_lane_b8", "PPPmi", "Pc", 
MergeNone, "", [IsStreamingCompatible, IsZASliceBaseOffsetIntr], [ImmCheck<3, ImmCheck0_15>]>; +def SVPSEL_H : SInst<"svpsel_lane_b16", "PPPmi", "Ps", MergeNone, "", [IsStreamingCompatible, IsZASliceBaseOffsetIntr], [ImmCheck<3, ImmCheck0_7>]>; +def SVPSEL_S : SInst<"svpsel_lane_b32", "PPPmi", "Pi", MergeNone, "", [IsStreamingCompatible, IsZASliceBaseOffsetIntr], [ImmCheck<3, ImmCheck0_3>]>; +def SVPSEL_D : SInst<"svpsel_lane_b64", "PPPmi", "Pl", MergeNone, "", [IsStreamingCompatible, IsZASliceBaseOffsetIntr], [ImmCheck<3, ImmCheck0_1>]>; +} + +let TargetGuard = "sme2" in { +def SVPSEL_COUNT_ALIAS_B : SInst<"svpsel_lane_c8", "}}Pmi", "Pc", MergeNone, "", [IsStreamingCompatible, IsZASliceBaseOffsetIntr], [ImmCheck<3, ImmCheck0_15>]>; +def SVPSEL_COUNT_ALIAS_H : SInst<"svpsel_lane_c16", "}}Pmi", "Ps", MergeNone, "", [IsStreamingCompatible, IsZASliceBaseOffsetIntr], [ImmCheck<3, ImmCheck0_7>]>; +def SVPSEL_COUNT_ALIAS_S : SInst<"svpsel_lane_c32", "}}Pmi", "Pi", MergeNone, "", [IsStreamingCompatible, IsZASliceBaseOffsetIntr], [ImmCheck<3, ImmCheck0_3>]>; +def SVPSEL_COUNT_ALIAS_D : SInst<"svpsel_lane_c64", "}}Pmi", "Pl", MergeNone, "", [IsStreamingCompatible, IsZASliceBaseOffsetIntr], [ImmCheck<3, ImmCheck0_1>]>; +} + +let TargetGuard = "sme2" in { +def REINTERPRET_SVBOOL_TO_SVCOUNT : Inst<"svreinterpret[_c]", "}P", "Pc", MergeNone, "", [IsStreamingCompatible], [], MemEltTyDefault>; +def REINTERPRET_SVCOUNT_TO_SVBOOL : Inst<"svreinterpret[_b]", "P}", "Pc", MergeNone, "", [IsStreamingCompatible], [], MemEltTyDefault>; + +def SVPTRUE_COUNT : SInst<"svptrue_{d}", "}v", "QcQsQiQl", MergeNone, "aarch64_sve_ptrue_{d}", [IsStreaming, IsOverloadNone], []>; + +def SVPEXT_SINGLE : SInst<"svpext_lane_{d}", "P}i", "QcQsQiQl", MergeNone, "aarch64_sve_pext", [IsStreaming], [ImmCheck<1, ImmCheck0_3>]>; +def SVPEXT_X2 : SInst<"svpext_lane_{d}_x2", "2.P}i", "QcQsQiQl", MergeNone, "aarch64_sve_pext_x2", [IsStreaming], [ImmCheck<1, ImmCheck0_1>]>; + +def SVCNTP_COUNT : SInst<"svcntp_{d}", "n}i", "QcQsQiQl", MergeNone, "aarch64_sve_cntp_{d}", [IsStreaming, IsOverloadNone], [ImmCheck<1, ImmCheck2_4_Mul2>]>; + +def SVLD1B_VG2 : MInst<"svld1[_{2}]_x2", "2}c", "cUc", [IsStructLoad, IsStreaming], MemEltTyDefault, "aarch64_sve_ld1_pn_vg2">; +def SVLD1H_VG2 : MInst<"svld1[_{2}]_x2", "2}c", "sUshb", [IsStructLoad, IsStreaming], MemEltTyDefault, "aarch64_sve_ld1_pn_vg2">; +def SVLD1W_VG2 : MInst<"svld1[_{2}]_x2", "2}c", "iUif", [IsStructLoad, IsStreaming], MemEltTyDefault, "aarch64_sve_ld1_pn_vg2">; +def SVLD1D_VG2 : MInst<"svld1[_{2}]_x2", "2}c", "lUld", [IsStructLoad, IsStreaming], MemEltTyDefault, "aarch64_sve_ld1_pn_vg2">; +def SVLD1B_VG4 : MInst<"svld1[_{2}]_x4", "4}c", "cUc", [IsStructLoad, IsStreaming], MemEltTyDefault, "aarch64_sve_ld1_pn_vg4">; +def SVLD1H_VG4 : MInst<"svld1[_{2}]_x4", "4}c", "sUshb", [IsStructLoad, IsStreaming], MemEltTyDefault, "aarch64_sve_ld1_pn_vg4">; +def SVLD1W_VG4 : MInst<"svld1[_{2}]_x4", "4}c", "iUif", [IsStructLoad, IsStreaming], MemEltTyDefault, "aarch64_sve_ld1_pn_vg4">; +def SVLD1D_VG4 : MInst<"svld1[_{2}]_x4", "4}c", "lUld", [IsStructLoad, IsStreaming], MemEltTyDefault, "aarch64_sve_ld1_pn_vg4">; + +def SVLDNT1B_VG2 : MInst<"svldnt1[_{2}]_x2", "2}c", "cUc", [IsStructLoad, IsStreaming], MemEltTyDefault, "aarch64_sve_ldnt1_pn_vg2">; +def SVLDNT1H_VG2 : MInst<"svldnt1[_{2}]_x2", "2}c", "sUshb", [IsStructLoad, IsStreaming], MemEltTyDefault, "aarch64_sve_ldnt1_pn_vg2">; +def SVLDNT1W_VG2 : MInst<"svldnt1[_{2}]_x2", "2}c", "iUif", [IsStructLoad, IsStreaming], MemEltTyDefault, 
"aarch64_sve_ldnt1_pn_vg2">; +def SVLDNT1D_VG2 : MInst<"svldnt1[_{2}]_x2", "2}c", "lUld", [IsStructLoad, IsStreaming], MemEltTyDefault, "aarch64_sve_ldnt1_pn_vg2">; +def SVLDNT1B_VG4 : MInst<"svldnt1[_{2}]_x4", "4}c", "cUc", [IsStructLoad, IsStreaming], MemEltTyDefault, "aarch64_sve_ldnt1_pn_vg4">; +def SVLDNT1H_VG4 : MInst<"svldnt1[_{2}]_x4", "4}c", "sUshb", [IsStructLoad, IsStreaming], MemEltTyDefault, "aarch64_sve_ldnt1_pn_vg4">; +def SVLDNT1W_VG4 : MInst<"svldnt1[_{2}]_x4", "4}c", "iUif", [IsStructLoad, IsStreaming], MemEltTyDefault, "aarch64_sve_ldnt1_pn_vg4">; +def SVLDNT1D_VG4 : MInst<"svldnt1[_{2}]_x4", "4}c", "lUld", [IsStructLoad, IsStreaming], MemEltTyDefault, "aarch64_sve_ldnt1_pn_vg4">; + +def SVLD1B_VNUM_VG2 : MInst<"svld1_vnum[_{2}]_x2", "2}cl", "cUc", [IsStructLoad, IsStreaming], MemEltTyDefault, "aarch64_sve_ld1_pn_vg2">; +def SVLD1H_VNUM_VG2 : MInst<"svld1_vnum[_{2}]_x2", "2}cl", "sUshb", [IsStructLoad, IsStreaming], MemEltTyDefault, "aarch64_sve_ld1_pn_vg2">; +def SVLD1W_VNUM_VG2 : MInst<"svld1_vnum[_{2}]_x2", "2}cl", "iUif", [IsStructLoad, IsStreaming], MemEltTyDefault, "aarch64_sve_ld1_pn_vg2">; +def SVLD1D_VNUM_VG2 : MInst<"svld1_vnum[_{2}]_x2", "2}cl", "lUld", [IsStructLoad, IsStreaming], MemEltTyDefault, "aarch64_sve_ld1_pn_vg2">; +def SVLD1B_VNUM_VG4 : MInst<"svld1_vnum[_{2}]_x4", "4}cl", "cUc", [IsStructLoad, IsStreaming], MemEltTyDefault, "aarch64_sve_ld1_pn_vg4">; +def SVLD1H_VNUM_VG4 : MInst<"svld1_vnum[_{2}]_x4", "4}cl", "sUshb", [IsStructLoad, IsStreaming], MemEltTyDefault, "aarch64_sve_ld1_pn_vg4">; +def SVLD1W_VNUM_VG4 : MInst<"svld1_vnum[_{2}]_x4", "4}cl", "iUif", [IsStructLoad, IsStreaming], MemEltTyDefault, "aarch64_sve_ld1_pn_vg4">; +def SVLD1D_VNUM_VG4 : MInst<"svld1_vnum[_{2}]_x4", "4}cl", "lUld", [IsStructLoad, IsStreaming], MemEltTyDefault, "aarch64_sve_ld1_pn_vg4">; + +def SVLDNT1B_VNUM_VG2 : MInst<"svldnt1_vnum[_{2}]_x2", "2}cl", "cUc", [IsStructLoad, IsStreaming], MemEltTyDefault, "aarch64_sve_ldnt1_pn_vg2">; +def SVLDNT1H_VNUM_VG2 : MInst<"svldnt1_vnum[_{2}]_x2", "2}cl", "sUshb", [IsStructLoad, IsStreaming], MemEltTyDefault, "aarch64_sve_ldnt1_pn_vg2">; +def SVLDNT1W_VNUM_VG2 : MInst<"svldnt1_vnum[_{2}]_x2", "2}cl", "iUif", [IsStructLoad, IsStreaming], MemEltTyDefault, "aarch64_sve_ldnt1_pn_vg2">; +def SVLDNT1D_VNUM_VG2 : MInst<"svldnt1_vnum[_{2}]_x2", "2}cl", "lUld", [IsStructLoad, IsStreaming], MemEltTyDefault, "aarch64_sve_ldnt1_pn_vg2">; +def SVLDNT1B_VNUM_VG4 : MInst<"svldnt1_vnum[_{2}]_x4", "4}cl", "cUc", [IsStructLoad, IsStreaming], MemEltTyDefault, "aarch64_sve_ldnt1_pn_vg4">; +def SVLDNT1H_VNUM_VG4 : MInst<"svldnt1_vnum[_{2}]_x4", "4}cl", "sUshb", [IsStructLoad, IsStreaming], MemEltTyDefault, "aarch64_sve_ldnt1_pn_vg4">; +def SVLDNT1W_VNUM_VG4 : MInst<"svldnt1_vnum[_{2}]_x4", "4}cl", "iUif", [IsStructLoad, IsStreaming], MemEltTyDefault, "aarch64_sve_ldnt1_pn_vg4">; +def SVLDNT1D_VNUM_VG4 : MInst<"svldnt1_vnum[_{2}]_x4", "4}cl", "lUld", [IsStructLoad, IsStreaming], MemEltTyDefault, "aarch64_sve_ldnt1_pn_vg4">; + +def SVST1B_VG2 : MInst<"svst1[_{2}_x2]", "v}p2", "cUc", [IsStructStore, IsStreaming], MemEltTyDefault, "aarch64_sve_st1_pn_vg2">; +def SVST1H_VG2 : MInst<"svst1[_{2}_x2]", "v}p2", "sUshb", [IsStructStore, IsStreaming], MemEltTyDefault, "aarch64_sve_st1_pn_vg2">; +def SVST1W_VG2 : MInst<"svst1[_{2}_x2]", "v}p2", "iUif", [IsStructStore, IsStreaming], MemEltTyDefault, "aarch64_sve_st1_pn_vg2">; +def SVST1D_VG2 : MInst<"svst1[_{2}_x2]", "v}p2", "lUld", [IsStructStore, IsStreaming], MemEltTyDefault, "aarch64_sve_st1_pn_vg2">; +def 
SVST1B_VG4 : MInst<"svst1[_{2}_x4]", "v}p4", "cUc", [IsStructStore, IsStreaming], MemEltTyDefault, "aarch64_sve_st1_pn_vg4">; +def SVST1H_VG4 : MInst<"svst1[_{2}_x4]", "v}p4", "sUshb", [IsStructStore, IsStreaming], MemEltTyDefault, "aarch64_sve_st1_pn_vg4">; +def SVST1W_VG4 : MInst<"svst1[_{2}_x4]", "v}p4", "iUif", [IsStructStore, IsStreaming], MemEltTyDefault, "aarch64_sve_st1_pn_vg4">; +def SVST1D_VG4 : MInst<"svst1[_{2}_x4]", "v}p4", "lUld", [IsStructStore, IsStreaming], MemEltTyDefault, "aarch64_sve_st1_pn_vg4">; + +def SVST1B_VNUM_VG2 : MInst<"svst1_vnum[_{2}_x2]", "v}pl2", "cUc", [IsStructStore, IsStreaming], MemEltTyDefault, "aarch64_sve_st1_pn_vg2">; +def SVST1H_VNUM_VG2 : MInst<"svst1_vnum[_{2}_x2]", "v}pl2", "sUshb", [IsStructStore, IsStreaming], MemEltTyDefault, "aarch64_sve_st1_pn_vg2">; +def SVST1W_VNUM_VG2 : MInst<"svst1_vnum[_{2}_x2]", "v}pl2", "iUif", [IsStructStore, IsStreaming], MemEltTyDefault, "aarch64_sve_st1_pn_vg2">; +def SVST1D_VNUM_VG2 : MInst<"svst1_vnum[_{2}_x2]", "v}pl2", "lUld", [IsStructStore, IsStreaming], MemEltTyDefault, "aarch64_sve_st1_pn_vg2">; +def SVST1B_VNUM_VG4 : MInst<"svst1_vnum[_{2}_x4]", "v}pl4", "cUc", [IsStructStore, IsStreaming], MemEltTyDefault, "aarch64_sve_st1_pn_vg4">; +def SVST1H_VNUM_VG4 : MInst<"svst1_vnum[_{2}_x4]", "v}pl4", "sUshb", [IsStructStore, IsStreaming], MemEltTyDefault, "aarch64_sve_st1_pn_vg4">; +def SVST1W_VNUM_VG4 : MInst<"svst1_vnum[_{2}_x4]", "v}pl4", "iUif", [IsStructStore, IsStreaming], MemEltTyDefault, "aarch64_sve_st1_pn_vg4">; +def SVST1D_VNUM_VG4 : MInst<"svst1_vnum[_{2}_x4]", "v}pl4", "lUld", [IsStructStore, IsStreaming], MemEltTyDefault, "aarch64_sve_st1_pn_vg4">; + +def SVSTNT1B_VG2 : MInst<"svstnt1[_{2}_x2]", "v}p2", "cUc", [IsStructStore, IsStreaming], MemEltTyDefault, "aarch64_sve_stnt1_pn_vg2">; +def SVSTNT1H_VG2 : MInst<"svstnt1[_{2}_x2]", "v}p2", "sUshb", [IsStructStore, IsStreaming], MemEltTyDefault, "aarch64_sve_stnt1_pn_vg2">; +def SVSTNT1W_VG2 : MInst<"svstnt1[_{2}_x2]", "v}p2", "iUif", [IsStructStore, IsStreaming], MemEltTyDefault, "aarch64_sve_stnt1_pn_vg2">; +def SVSTNT1D_VG2 : MInst<"svstnt1[_{2}_x2]", "v}p2", "lUld", [IsStructStore, IsStreaming], MemEltTyDefault, "aarch64_sve_stnt1_pn_vg2">; +def SVSTNT1B_VG4 : MInst<"svstnt1[_{2}_x4]", "v}p4", "cUc", [IsStructStore, IsStreaming], MemEltTyDefault, "aarch64_sve_stnt1_pn_vg4">; +def SVSTNT1H_VG4 : MInst<"svstnt1[_{2}_x4]", "v}p4", "sUshb", [IsStructStore, IsStreaming], MemEltTyDefault, "aarch64_sve_stnt1_pn_vg4">; +def SVSTNT1W_VG4 : MInst<"svstnt1[_{2}_x4]", "v}p4", "iUif", [IsStructStore, IsStreaming], MemEltTyDefault, "aarch64_sve_stnt1_pn_vg4">; +def SVSTNT1D_VG4 : MInst<"svstnt1[_{2}_x4]", "v}p4", "lUld", [IsStructStore, IsStreaming], MemEltTyDefault, "aarch64_sve_stnt1_pn_vg4">; + +def SVSTNT1B_VNUM_VG2 : MInst<"svstnt1_vnum[_{2}_x2]", "v}pl2", "cUc", [IsStructStore, IsStreaming], MemEltTyDefault, "aarch64_sve_stnt1_pn_vg2">; +def SVSTNT1H_VNUM_VG2 : MInst<"svstnt1_vnum[_{2}_x2]", "v}pl2", "sUshb", [IsStructStore, IsStreaming], MemEltTyDefault, "aarch64_sve_stnt1_pn_vg2">; +def SVSTNT1W_VNUM_VG2 : MInst<"svstnt1_vnum[_{2}_x2]", "v}pl2", "iUif", [IsStructStore, IsStreaming], MemEltTyDefault, "aarch64_sve_stnt1_pn_vg2">; +def SVSTNT1D_VNUM_VG2 : MInst<"svstnt1_vnum[_{2}_x2]", "v}pl2", "lUld", [IsStructStore, IsStreaming], MemEltTyDefault, "aarch64_sve_stnt1_pn_vg2">; +def SVSTNT1B_VNUM_VG4 : MInst<"svstnt1_vnum[_{2}_x4]", "v}pl4", "cUc", [IsStructStore, IsStreaming], MemEltTyDefault, "aarch64_sve_stnt1_pn_vg4">; +def SVSTNT1H_VNUM_VG4 : 
MInst<"svstnt1_vnum[_{2}_x4]", "v}pl4", "sUshb", [IsStructStore, IsStreaming], MemEltTyDefault, "aarch64_sve_stnt1_pn_vg4">; +def SVSTNT1W_VNUM_VG4 : MInst<"svstnt1_vnum[_{2}_x4]", "v}pl4", "iUif", [IsStructStore, IsStreaming], MemEltTyDefault, "aarch64_sve_stnt1_pn_vg4">; +def SVSTNT1D_VNUM_VG4 : MInst<"svstnt1_vnum[_{2}_x4]", "v}pl4", "lUld", [IsStructStore, IsStreaming], MemEltTyDefault, "aarch64_sve_stnt1_pn_vg4">; + +def SVWHILEGE_COUNT : SInst<"svwhilege_{d}", "}lli", "QcQsQiQl", MergeNone, "aarch64_sve_whilege_{d}", [IsStreaming, IsOverloadNone], [ImmCheck<2, ImmCheck2_4_Mul2>]>; +def SVWHILEGT_COUNT : SInst<"svwhilegt_{d}", "}lli", "QcQsQiQl", MergeNone, "aarch64_sve_whilegt_{d}", [IsStreaming, IsOverloadNone], [ImmCheck<2, ImmCheck2_4_Mul2>]>; +def SVWHILELE_COUNT : SInst<"svwhilele_{d}", "}lli", "QcQsQiQl", MergeNone, "aarch64_sve_whilele_{d}", [IsStreaming, IsOverloadNone], [ImmCheck<2, ImmCheck2_4_Mul2>]>; +def SVWHILELT_COUNT : SInst<"svwhilelt_{d}", "}lli", "QcQsQiQl", MergeNone, "aarch64_sve_whilelt_{d}", [IsStreaming, IsOverloadNone], [ImmCheck<2, ImmCheck2_4_Mul2>]>; +def SVWHILELO_COUNT : SInst<"svwhilelo_{d}", "}nni", "QcQsQiQl", MergeNone, "aarch64_sve_whilelo_{d}", [IsStreaming, IsOverloadNone], [ImmCheck<2, ImmCheck2_4_Mul2>]>; +def SVWHILELS_COUNT : SInst<"svwhilels_{d}", "}nni", "QcQsQiQl", MergeNone, "aarch64_sve_whilels_{d}", [IsStreaming, IsOverloadNone], [ImmCheck<2, ImmCheck2_4_Mul2>]>; +def SVWHILEHI_COUNT : SInst<"svwhilehi_{d}", "}nni", "QcQsQiQl", MergeNone, "aarch64_sve_whilehi_{d}", [IsStreaming, IsOverloadNone], [ImmCheck<2, ImmCheck2_4_Mul2>]>; +def SVWHILEHS_COUNT : SInst<"svwhilehs_{d}", "}nni", "QcQsQiQl", MergeNone, "aarch64_sve_whilehs_{d}", [IsStreaming, IsOverloadNone], [ImmCheck<2, ImmCheck2_4_Mul2>]>; +def SVWHILEGE_X2 : SInst<"svwhilege_{d}_x2", "2ll", "PcPsPiPl", MergeNone, "aarch64_sve_whilege_x2", [IsStreamingCompatible], []>; +def SVWHILEGT_X2 : SInst<"svwhilegt_{d}_x2", "2ll", "PcPsPiPl", MergeNone, "aarch64_sve_whilegt_x2", [IsStreamingCompatible], []>; +def SVWHILELE_X2 : SInst<"svwhilele_{d}_x2", "2ll", "PcPsPiPl", MergeNone, "aarch64_sve_whilele_x2", [IsStreamingCompatible], []>; +def SVWHILELT_X2 : SInst<"svwhilelt_{d}_x2", "2ll", "PcPsPiPl", MergeNone, "aarch64_sve_whilelt_x2", [IsStreamingCompatible], []>; +def SVWHILELO_X2 : SInst<"svwhilelo_{d}_x2", "2nn", "PcPsPiPl", MergeNone, "aarch64_sve_whilelo_x2", [IsStreamingCompatible], []>; +def SVWHILELS_X2 : SInst<"svwhilels_{d}_x2", "2nn", "PcPsPiPl", MergeNone, "aarch64_sve_whilels_x2", [IsStreamingCompatible], []>; +def SVWHILEHI_X2 : SInst<"svwhilehi_{d}_x2", "2nn", "PcPsPiPl", MergeNone, "aarch64_sve_whilehi_x2", [IsStreamingCompatible], []>; +def SVWHILEHS_X2 : SInst<"svwhilehs_{d}_x2", "2nn", "PcPsPiPl", MergeNone, "aarch64_sve_whilehs_x2", [IsStreamingCompatible], []>; +} + +//////////////////////////////////////////////////////////////////////////////// +// SME2 + +let TargetGuard = "sme2" in { + // SRSHL / URSHL + def SVSRSHL_SINGLE_X2 : SInst<"svrshl[_single_{d}_x2]", "22d", "csil", MergeNone, "aarch64_sve_srshl_single_x2", [IsStreaming], []>; + def SVURSHL_SINGLE_X2 : SInst<"svrshl[_single_{d}_x2]", "22d", "UcUsUiUl", MergeNone, "aarch64_sve_urshl_single_x2", [IsStreaming], []>; + def SVSRSHL_SINGLE_X4 : SInst<"svrshl[_single_{d}_x4]", "44d", "csil", MergeNone, "aarch64_sve_srshl_single_x4", [IsStreaming], []>; + def SVURSHL_SINGLE_X4 : SInst<"svrshl[_single_{d}_x4]", "44d", "UcUsUiUl", MergeNone, "aarch64_sve_urshl_single_x4", [IsStreaming], []>; + + def SVSRSHL_X2 : 
SInst<"svrshl[_{d}_x2]", "222", "csil", MergeNone, "aarch64_sve_srshl_x2", [IsStreaming], []>; + def SVURSHL_X2 : SInst<"svrshl[_{d}_x2]", "222", "UcUsUiUl", MergeNone, "aarch64_sve_urshl_x2", [IsStreaming], []>; + def SVSRSHL_X4 : SInst<"svrshl[_{d}_x4]", "444", "csil", MergeNone, "aarch64_sve_srshl_x4", [IsStreaming], []>; + def SVURSHL_X4 : SInst<"svrshl[_{d}_x4]", "444", "UcUsUiUl", MergeNone, "aarch64_sve_urshl_x4", [IsStreaming], []>; + + // SQRSHR / UQRSHR + def SVQRSHR_X2 : SInst<"svqrshr[_{0}_x2]", "h2i", "i", MergeNone, "aarch64_sve_sqrshr_x2", [IsStreaming], [ImmCheck<1, ImmCheck1_16>]>; + def SVUQRSHR_X2 : SInst<"svqrshr[_{0}_x2]", "e2i", "Ui", MergeNone, "aarch64_sve_uqrshr_x2", [IsStreaming], [ImmCheck<1, ImmCheck1_16>]>; + def SVQRSHR_X4 : SInst<"svqrshr[_{0}_x4]", "q4i", "il", MergeNone, "aarch64_sve_sqrshr_x4", [IsStreaming], [ImmCheck<1, ImmCheckShiftRight, 0>]>; + def SVUQRSHR_X4 : SInst<"svqrshr[_{0}_x4]", "b4i", "UiUl", MergeNone, "aarch64_sve_uqrshr_x4", [IsStreaming], [ImmCheck<1, ImmCheckShiftRight, 0>]>; + + // SQRSHRN / UQRSHRN + def SVQRSHRN_X2 : SInst<"svqrshrn[_{0}_x2]", "h2i", "i", MergeNone, "aarch64_sve_sqrshrn_x2", [IsStreamingCompatible], [ImmCheck<1, ImmCheck1_16>]>; + def SVUQRSHRN_X2 : SInst<"svqrshrn[_{0}_x2]", "e2i", "Ui", MergeNone, "aarch64_sve_uqrshrn_x2", [IsStreamingCompatible], [ImmCheck<1, ImmCheck1_16>]>; + def SVQRSHRN_X4 : SInst<"svqrshrn[_{0}_x4]", "q4i", "il", MergeNone, "aarch64_sve_sqrshrn_x4", [IsStreaming], [ImmCheck<1, ImmCheckShiftRight, 0>]>; + def SVUQRSHRN_X4 : SInst<"svqrshrn[_{0}_x4]", "b4i", "UiUl", MergeNone, "aarch64_sve_uqrshrn_x4", [IsStreaming], [ImmCheck<1, ImmCheckShiftRight, 0>]>; + + // SQRSHRU + def SVSQRSHRU_X2 : SInst<"svsqrshru[_{0}_x2]", "e2i", "i", MergeNone, "aarch64_sve_sqrshru_x2", [IsStreaming], [ImmCheck<1, ImmCheck1_16>]>; + def SVSQRSHRU_X4 : SInst<"svsqrshru[_{0}_x4]", "b4i", "il", MergeNone, "aarch64_sve_sqrshru_x4", [IsStreaming], [ImmCheck<1, ImmCheckShiftRight, 0>]>; + + // SQRSHRUN + def SVSQRSHRUN_X2 : SInst<"svsqrshrun[_{0}_x2]", "e2i", "i", MergeNone, "aarch64_sve_sqrshrun_x2", [IsStreamingCompatible], [ImmCheck<1, ImmCheck1_16>]>; + def SVSQRSHRUN_X4 : SInst<"svsqrshrun[_{0}_x4]", "b4i", "il", MergeNone, "aarch64_sve_sqrshrun_x4", [IsStreaming], [ImmCheck<1, ImmCheckShiftRight, 0>]>; + + def SVZIP_X2 : SInst<"svzip[_{d}]_x2", "2dd", "cUcsUsiUilUlbhfd", MergeNone, "aarch64_sve_zip_x2", [IsStreaming], []>; + def SVZIPQ_X2 : SInst<"svzipq[_{d}]_x2", "2dd", "cUcsUsiUilUlbhfd", MergeNone, "aarch64_sve_zipq_x2", [IsStreaming], []>; + def SVZIP_X4 : SInst<"svzip[_{d}]_x4", "4dddd", "cUcsUsiUilUlbhfd", MergeNone, "aarch64_sve_zip_x4", [IsStreaming], []>; + def SVZIPQ_X4 : SInst<"svzipq[_{d}]_x4", "4dddd", "cUcsUsiUilUlbhfd", MergeNone, "aarch64_sve_zipq_x4", [IsStreaming], []>; + + def SVUZP_X2 : SInst<"svuzp[_{d}]_x2", "2dd", "cUcsUsiUilUlbhfd", MergeNone, "aarch64_sve_uzp_x2", [IsStreaming], []>; + def SVUZPQ_X2 : SInst<"svuzpq[_{d}]_x2", "2dd", "cUcsUsiUilUlbhfd", MergeNone, "aarch64_sve_uzpq_x2", [IsStreaming], []>; + def SVUZP_X4 : SInst<"svuzp[_{d}]_x4", "4dddd", "cUcsUsiUilUlbhfd", MergeNone, "aarch64_sve_uzp_x4", [IsStreaming], []>; + def SVUZPQ_X4 : SInst<"svuzpq[_{d}]_x4", "4dddd", "cUcsUsiUilUlbhfd", MergeNone, "aarch64_sve_uzpq_x4", [IsStreaming], []>; + + // SQDMULH + def SVSQDMULH_SINGLE_X2 : SInst<"svsqdmulh[_single_{d}_x2]", "22d", "csil", MergeNone, "aarch64_sve_sqdmulh_single_vgx2", [IsStreaming], []>; + def SVSQDMULH_SINGLE_X4 : SInst<"svsqdmulh[_single_{d}_x4]", "44d", "csil", 
MergeNone, "aarch64_sve_sqdmulh_single_vgx4", [IsStreaming], []>; + def SVSQDMULH_X2 : SInst<"svsqdmulh[_{d}_x2]", "222", "csil", MergeNone, "aarch64_sve_sqdmulh_vgx2", [IsStreaming], []>; + def SVSQDMULH_X4 : SInst<"svsqdmulh[_{d}_x4]", "444", "csil", MergeNone, "aarch64_sve_sqdmulh_vgx4", [IsStreaming], []>; + + // 2-way dot products + def SVDOT_X2_S : SInst<"svdot[_{d}_{2}_{3}]", "ddhh", "i", MergeNone, "aarch64_sve_sdot_x2", [IsStreamingCompatible], []>; + def SVDOT_X2_U : SInst<"svdot[_{d}_{2}_{3}]", "ddhh", "Ui", MergeNone, "aarch64_sve_udot_x2", [IsStreamingCompatible], []>; + def SVDOT_X2_F : SInst<"svdot[_{d}_{2}_{3}]", "ddhh", "f", MergeNone, "aarch64_sve_fdot_x2", [IsStreamingCompatible], []>; + def SVDOT_LANE_X2_S : SInst<"svdot_lane[_{d}_{2}_{3}]", "ddhhi", "i", MergeNone, "aarch64_sve_sdot_lane_x2", [IsStreamingCompatible], [ImmCheck<3, ImmCheck0_3>]>; + def SVDOT_LANE_X2_U : SInst<"svdot_lane[_{d}_{2}_{3}]", "ddhhi", "Ui", MergeNone, "aarch64_sve_udot_lane_x2", [IsStreamingCompatible], [ImmCheck<3, ImmCheck0_3>]>; + def SVDOT_LANE_X2_F : SInst<"svdot_lane[_{d}_{2}_{3}]", "ddhhi", "f", MergeNone, "aarch64_sve_fdot_lane_x2", [IsStreamingCompatible], [ImmCheck<3, ImmCheck0_3>]>; + + // 2-vector and 4-vector signed and unsigned unpacks + def SVSUNPK_X2 : SInst<"svunpk[_{d}_x2]", "2h", "sil", MergeNone, "aarch64_sve_sunpk_x2", [IsStreaming], []>; + def SVUUNPK_X2 : SInst<"svunpk[_{d}_x2]", "2h", "UsUiUl", MergeNone, "aarch64_sve_uunpk_x2", [IsStreaming], []>; + def SVSUNPK_X4 : SInst<"svunpk[_{d}_x4]", "42.h", "sil", MergeNone, "aarch64_sve_sunpk_x4", [IsStreaming], []>; + def SVUUNPK_X4 : SInst<"svunpk[_{d}_x4]", "42.h", "UsUiUl", MergeNone, "aarch64_sve_uunpk_x4", [IsStreaming], []>; + + // CLAMPS + def SVFCLAMP : SInst<"svclamp[_{d}]", "dddd", "hfd", MergeNone, "aarch64_sve_fclamp", [IsStreamingCompatible], []>; + + def SVSCLAMP_X2 : SInst<"svclamp[_single_{d}_x2]", "22dd", "csil", MergeNone, "aarch64_sve_sclamp_single_x2", [IsStreaming], []>; + def SVUCLAMP_X2 : SInst<"svclamp[_single_{d}_x2]", "22dd", "UcUsUiUl", MergeNone, "aarch64_sve_uclamp_single_x2", [IsStreaming], []>; + def SVFCLAMP_X2 : SInst<"svclamp[_single_{d}_x2]", "22dd", "hfd", MergeNone, "aarch64_sve_fclamp_single_x2", [IsStreaming], []>; + + def SVSCLAMP_X4 : SInst<"svclamp[_single_{d}_x4]", "44dd", "csil", MergeNone, "aarch64_sve_sclamp_single_x4", [IsStreaming], []>; + def SVUCLAMP_X4 : SInst<"svclamp[_single_{d}_x4]", "44dd", "UcUsUiUl", MergeNone, "aarch64_sve_uclamp_single_x4", [IsStreaming], []>; + def SVFCLAMP_X4 : SInst<"svclamp[_single_{d}_x4]", "44dd", "hfd", MergeNone, "aarch64_sve_fclamp_single_x4", [IsStreaming], []>; + + // 2-way and 4-way selects + def SVSEL_X2 : SInst<"svsel[_{d}_x2]", "2}22", "cUcsUsiUilUlbhfd", MergeNone, "aarch64_sve_sel_x2", [IsStreaming], []>; + def SVSEL_X4 : SInst<"svsel[_{d}_x4]", "4}44", "cUcsUsiUilUlbhfd", MergeNone, "aarch64_sve_sel_x4", [IsStreaming], []>; +} + +multiclass MinMaxIntr { + def SVS # NAME : SInst<"sv" # i # "[" # zm # "_{d}_" # mul # "]", t, "csil", MergeNone, "aarch64_sve_s" # i # zm # "_" # mul, [IsStreaming], []>; + def SVU # NAME : SInst<"sv" # i # "[" # zm # "_{d}_" # mul # "]", t, "UcUsUiUl", MergeNone, "aarch64_sve_u" # i # zm # "_" # mul, [IsStreaming], []>; + def SVF # NAME : SInst<"sv" # i # "[" # zm # "_{d}_" # mul # "]", t, "hfd", MergeNone, "aarch64_sve_f" # i # zm # "_" # mul, [IsStreaming], []>; +} + +let TargetGuard = "sme2" in { + // SMAX / UMAX / FMAX + defm MAX_SINGLE_X2 : MinMaxIntr<"max", "_single", "x2", "22d">; + defm 
MAX_MULTI_X2 : MinMaxIntr<"max", "", "x2", "222">; + defm MAX_SINGLE_X4 : MinMaxIntr<"max", "_single", "x4", "44d">; + defm MAX_MULTI_X4 : MinMaxIntr<"max", "", "x4", "444">; + + // SMIN / UMIN / FMIN + defm MIN_SINGLE_X2 : MinMaxIntr<"min", "_single", "x2", "22d">; + defm MIN_MULTI_X2 : MinMaxIntr<"min", "", "x2", "222">; + defm MIN_SINGLE_X4 : MinMaxIntr<"min", "_single", "x4", "44d">; + defm MIN_MULTI_X4 : MinMaxIntr<"min", "", "x4", "444">; +} + +let TargetGuard = "sme2" in { + // FMINNM / FMAXNM + def SVMAXNM_SINGLE_X2 : SInst<"svmaxnm[_single_{d}_x2]", "22d", "hfd", MergeNone, "aarch64_sve_fmaxnm_single_x2", [IsStreaming], []>; + def SVMAXNM_SINGLE_X4 : SInst<"svmaxnm[_single_{d}_x4]", "44d", "hfd", MergeNone, "aarch64_sve_fmaxnm_single_x4", [IsStreaming], []>; + + def SVMAXNM_X2 : SInst<"svmaxnm[_{d}_x2]", "222", "hfd", MergeNone, "aarch64_sve_fmaxnm_x2", [IsStreaming], []>; + def SVMAXNM_X4 : SInst<"svmaxnm[_{d}_x4]", "444", "hfd", MergeNone, "aarch64_sve_fmaxnm_x4", [IsStreaming], []>; + + def SVMINNM_SINGLE_X2 : SInst<"svminnm[_single_{d}_x2]", "22d", "hfd", MergeNone, "aarch64_sve_fminnm_single_x2", [IsStreaming], []>; + def SVMINNM_SINGLE_X4 : SInst<"svminnm[_single_{d}_x4]", "44d", "hfd", MergeNone, "aarch64_sve_fminnm_single_x4", [IsStreaming], []>; + + def SVMINNM_X2 : SInst<"svminnm[_{d}_x2]", "222", "hfd", MergeNone, "aarch64_sve_fminnm_x2", [IsStreaming], []>; + def SVMINNM_X4 : SInst<"svminnm[_{d}_x4]", "444", "hfd", MergeNone, "aarch64_sve_fminnm_x4", [IsStreaming], []>; +} + +let TargetGuard = "sme2" in { + // FRINTA / FRINTM / FRINTN / FRINTP + def SVRINTA_X2 : SInst<"svrinta[_{d}_x2]", "22", "f", MergeNone, "aarch64_sve_frinta_x2", [IsStreaming], []>; + def SVRINTA_X4 : SInst<"svrinta[_{d}_x4]", "44", "f", MergeNone, "aarch64_sve_frinta_x4", [IsStreaming], []>; + + def SVRINTM_X2 : SInst<"svrintm[_{d}_x2]", "22", "f", MergeNone, "aarch64_sve_frintm_x2", [IsStreaming], []>; + def SVRINTM_X4 : SInst<"svrintm[_{d}_x4]", "44", "f", MergeNone, "aarch64_sve_frintm_x4", [IsStreaming], []>; + + def SVRINTN_X2 : SInst<"svrintn[_{d}_x2]", "22", "f", MergeNone, "aarch64_sve_frintn_x2", [IsStreaming], []>; + def SVRINTN_X4 : SInst<"svrintn[_{d}_x4]", "44", "f", MergeNone, "aarch64_sve_frintn_x4", [IsStreaming], []>; + + def SVRINTP_X2 : SInst<"svrintp[_{d}_x2]", "22", "f", MergeNone, "aarch64_sve_frintp_x2", [IsStreaming], []>; + def SVRINTP_X4 : SInst<"svrintp[_{d}_x4]", "44", "f", MergeNone, "aarch64_sve_frintp_x4", [IsStreaming], []>; +} + +let TargetGuard = "sme2" in { +// == ADD (vectors) == + def SVADD_SINGLE_X2 : SInst<"svadd[_single_{d}_x2]", "22d", "cUcsUsiUilUl", MergeNone, "aarch64_sve_add_single_x2", [IsStreaming], []>; + def SVADD_SINGLE_X4 : SInst<"svadd[_single_{d}_x4]", "44d", "cUcsUsiUilUl", MergeNone, "aarch64_sve_add_single_x4", [IsStreaming], []>; +} + +// +// Multi-vector floating-point convert from single-precision to interleaved half-precision/BFloat16 +// +let TargetGuard = "sme2" in { + def SVCVTN_F16_X2 : SInst<"svcvtn_f16[_f32_x2]", "e2", "f", MergeNone, "aarch64_sve_fcvtn_x2", [IsStreaming],[]>; + def SVCVTN_BF16_X2 : SInst<"svcvtn_bf16[_f32_x2]", "$2", "f", MergeNone, "aarch64_sve_bfcvtn_x2", [IsOverloadNone, IsStreaming],[]>; +} + +// +// Multi-vector convert to/from floating-point. 
+// +let TargetGuard = "sme2" in { + def SVCVT_F16_X2 : SInst<"svcvt_f16[_f32_x2]", "e2", "f", MergeNone, "aarch64_sve_fcvt_x2", [IsStreaming],[]>; + def SVCVT_BF16_X2 : SInst<"svcvt_bf16[_f32_x2]", "$2", "f", MergeNone, "aarch64_sve_bfcvt_x2", [IsOverloadNone, IsStreaming],[]>; +} + +// +// Multi-vector convert to/from floating-point. +// +let TargetGuard = "sme2" in { + def SVCVT_F32_U32_X2 : SInst<"svcvt_{d}[_u32_x2]", "2.d2.u", "f", MergeNone, "aarch64_sve_fcvtu_x2", [IsStreaming], []>; + def SVCVT_U32_F32_X2 : SInst<"svcvt_u32[_{d}_x2]", "2.u2.d", "f", MergeNone, "aarch64_sve_ucvtf_x2", [IsStreaming], []>; + def SVCVT_F32_S32_X2 : SInst<"svcvt_{d}[_s32_x2]", "2.d2.x", "f", MergeNone, "aarch64_sve_fcvts_x2", [IsStreaming], []>; + def SVCVT_S32_F32_X2 : SInst<"svcvt_s32[_{d}_x2]", "2.x2.d", "f", MergeNone, "aarch64_sve_scvtf_x2", [IsStreaming], []>; + + def SVCVT_F32_U32_X4 : SInst<"svcvt_{d}[_u32_x4]", "4.d4.u", "f", MergeNone, "aarch64_sve_fcvtu_x4", [IsStreaming], []>; + def SVCVT_U32_F32_X4 : SInst<"svcvt_u32[_{d}_x4]", "4.u4.d", "f", MergeNone, "aarch64_sve_ucvtf_x4", [IsStreaming], []>; + def SVCVT_F32_S32_X4 : SInst<"svcvt_{d}[_s32_x4]", "4.d4.x", "f", MergeNone, "aarch64_sve_fcvts_x4", [IsStreaming], []>; + def SVCVT_S32_F32_X4 : SInst<"svcvt_s32[_{d}_x4]", "4.x4.d", "f", MergeNone, "aarch64_sve_scvtf_x4", [IsStreaming], []>; +} + +// BFloat16 floating-point multiply-subtract long from single-precision. + +let TargetGuard = "sme2" in { + def SVBFMLSLB : SInst<"svbfmlslb[_{d}]", "dd$$", "f", MergeNone, "aarch64_sve_bfmlslb", [IsOverloadNone, IsStreamingCompatible], []>; + def SVBFMLSLT : SInst<"svbfmlslt[_{d}]", "dd$$", "f", MergeNone, "aarch64_sve_bfmlslt", [IsOverloadNone, IsStreamingCompatible], []>; + + def SVBFMLSLB_LANE : SInst<"svbfmlslb_lane[_{d}]", "dd$$i", "f", MergeNone, "aarch64_sve_bfmlslb_lane", [IsOverloadNone, IsStreamingCompatible], [ImmCheck<3, ImmCheck0_7>]>; + def SVBFMLSLT_LANE : SInst<"svbfmlslt_lane[_{d}]", "dd$$i", "f", MergeNone, "aarch64_sve_bfmlslt_lane", [IsOverloadNone, IsStreamingCompatible], [ImmCheck<3, ImmCheck0_7>]>; +} + +// +// Multi-vector saturating extract narrow +// +let TargetGuard = "sme2" in { + def SVQCVT_S16_S32_X2 : SInst<"svqcvt_s16[_{d}_x2]", "h2.d", "i", MergeNone, "aarch64_sve_sqcvt_x2", [IsStreaming], []>; + def SVQCVT_U16_U32_X2 : SInst<"svqcvt_u16[_{d}_x2]", "e2.d", "Ui", MergeNone, "aarch64_sve_uqcvt_x2", [IsStreaming], []>; + def SVQCVT_U16_S32_X2 : SInst<"svqcvt_u16[_{d}_x2]", "e2.d", "i", MergeNone, "aarch64_sve_sqcvtu_x2", [IsStreaming], []>; + + def SVQCVT_S8_S32_X4 : SInst<"svqcvt_s8[_{d}_x4]", "q4.d", "i", MergeNone, "aarch64_sve_sqcvt_x4", [IsStreaming], []>; + def SVQCVT_U8_U32_X4 : SInst<"svqcvt_u8[_{d}_x4]", "b4.d", "Ui", MergeNone, "aarch64_sve_uqcvt_x4", [IsStreaming], []>; + def SVQCVT_U8_S32_X4 : SInst<"svqcvt_u8[_{d}_x4]", "b4.d", "i", MergeNone, "aarch64_sve_sqcvtu_x4", [IsStreaming], []>; + + def SVQCVT_S16_S64_X4 : SInst<"svqcvt_s16[_{d}_x4]", "q4.d", "l", MergeNone, "aarch64_sve_sqcvt_x4", [IsStreaming], []>; + def SVQCVT_U16_U64_X4 : SInst<"svqcvt_u16[_{d}_x4]", "b4.d", "Ul", MergeNone, "aarch64_sve_uqcvt_x4", [IsStreaming], []>; + def SVQCVT_U16_S64_X4 : SInst<"svqcvt_u16[_{d}_x4]", "b4.d", "l", MergeNone, "aarch64_sve_sqcvtu_x4", [IsStreaming], []>; +} + +// +// Multi-vector saturating extract narrow and interleave +// +let TargetGuard = "sme2" in { + def SVQCVTN_S16_S32_X2 : SInst<"svqcvtn_s16[_{d}_x2]", "h2.d", "i", MergeNone, "aarch64_sve_sqcvtn_x2", [IsStreamingCompatible], []>; + def 
SVQCVTN_U16_U32_X2 : SInst<"svqcvtn_u16[_{d}_x2]", "e2.d", "Ui", MergeNone, "aarch64_sve_uqcvtn_x2", [IsStreamingCompatible], []>; + def SVQCVTN_U16_S32_X2 : SInst<"svqcvtn_u16[_{d}_x2]", "e2.d", "i", MergeNone, "aarch64_sve_sqcvtun_x2", [IsStreamingCompatible], []>; + + def SVQCVTN_S8_S32_X4 : SInst<"svqcvtn_s8[_{d}_x4]", "q4.d", "i", MergeNone, "aarch64_sve_sqcvtn_x4", [IsStreaming], []>; + def SVQCVTN_U8_U32_X4 : SInst<"svqcvtn_u8[_{d}_x4]", "b4.d", "Ui", MergeNone, "aarch64_sve_uqcvtn_x4", [IsStreaming], []>; + def SVQCVTN_U8_S32_X4 : SInst<"svqcvtn_u8[_{d}_x4]", "b4.d", "i", MergeNone, "aarch64_sve_sqcvtun_x4", [IsStreaming], []>; + + def SVQCVTN_S16_S64_X4 : SInst<"svqcvtn_s16[_{d}_x4]", "q4.d", "l", MergeNone, "aarch64_sve_sqcvtn_x4", [IsStreaming], []>; + def SVQCVTN_U16_U64_X4 : SInst<"svqcvtn_u16[_{d}_x4]", "b4.d", "Ul", MergeNone, "aarch64_sve_uqcvtn_x4", [IsStreaming], []>; + def SVQCVTN_U16_S64_X4 : SInst<"svqcvtn_u16[_{d}_x4]", "b4.d", "l", MergeNone, "aarch64_sve_sqcvtun_x4", [IsStreaming], []>; +} diff --git a/clang/include/clang/Basic/arm_sve_common.td b/clang/include/clang/Basic/arm_sve_common.td new file mode 100644 --- /dev/null +++ b/clang/include/clang/Basic/arm_sve_common.td @@ -0,0 +1,252 @@ +//===----------------------------------------------------------------------===// +// Instruction definitions +//===----------------------------------------------------------------------===// +// Every intrinsic subclasses "Inst". An intrinsic has a name, a prototype and +// a sequence of typespecs. +// +// The name is the base name of the intrinsic, for example "svld1". This is +// then mangled by the tblgen backend to add type information ("svld1_s16"). +// +// A typespec is a sequence of uppercase characters (modifiers) followed by one +// lowercase character. A typespec encodes a particular "base type" of the +// intrinsic. +// +// An example typespec is "Us" - unsigned short - svuint16_t. The available +// typespec codes are given below. +// +// The string given to an Inst class is a sequence of typespecs. The intrinsic +// is instantiated for every typespec in the sequence. For example "sdUsUd". +// +// The prototype is a string that defines the return type of the intrinsic +// and the type of each argument. The return type and every argument gets a +// "modifier" that can change in some way the "base type" of the intrinsic. +// +// The modifier 'd' means "default" and does not modify the base type in any +// way. The available modifiers are given below. +// +// Typespecs +// --------- +// c: char +// s: short +// i: int +// l: long +// f: float +// h: half-float +// d: double +// b: bfloat + +// Typespec modifiers +// ------------------ +// P: boolean +// Q: svcount +// U: unsigned + +// Prototype modifiers +// ------------------- +// prototype: return (arg, arg, ...) 
+//
+// 2,3,4: array of vectors
+// .: indicator for multi-vector modifier that will follow (e.g. 2.x)
+// v: void
+// x: vector of signed integers
+// u: vector of unsigned integers
+// d: default
+// c: const pointer type
+// P: predicate type
+// s: scalar of element type
+// a: scalar of element type (splat to vector type)
+// R: scalar of 1/2 width element type (splat to vector type)
+// r: scalar of 1/4 width element type (splat to vector type)
+// @: unsigned scalar of 1/4 width element type (splat to vector type)
+// e: 1/2 width unsigned elements, 2x element count
+// b: 1/4 width unsigned elements, 4x element count
+// h: 1/2 width elements, 2x element count
+// q: 1/4 width elements, 4x element count
+// o: 4x width elements, 1/4 element count
+//
+// w: vector of element type promoted to 64bits, vector maintains
+//    signedness of its element type.
+// f: element type promoted to uint64_t (splat to vector type)
+// j: element type promoted to 64bits (splat to vector type)
+// K: element type bitcast to a signed integer (splat to vector type)
+// L: element type bitcast to an unsigned integer (splat to vector type)
+//
+// i: constant uint64_t
+// k: int32_t
+// l: int64_t
+// m: uint32_t
+// n: uint64_t
+// y: bool
+
+// t: svint32_t
+// z: svuint32_t
+// g: svuint64_t
+// O: svfloat16_t
+// M: svfloat32_t
+// N: svfloat64_t
+// $: svbfloat16_t
+
+// J: Prefetch type (sv_prfop)
+// p: pointer to element type
+// A: pointer to int8_t
+// B: pointer to int16_t
+// C: pointer to int32_t
+// D: pointer to int64_t
+
+// E: pointer to uint8_t
+// F: pointer to uint16_t
+// G: pointer to uint32_t
+// H: pointer to uint64_t
+
+// Q: const pointer to void
+
+// S: const pointer to int8_t
+// T: const pointer to int16_t
+// U: const pointer to int32_t
+// V: const pointer to int64_t
+//
+// W: const pointer to uint8_t
+// X: const pointer to uint16_t
+// Y: const pointer to uint32_t
+// Z: const pointer to uint64_t
+
+// Prototype modifiers added for SME
+// {: pointer to void
+//
+// Prototype modifiers added for SVE2p1
+// }: svcount_t
+//
+// (A worked example of combining a typespec with a prototype is given after
+// the MergeType definitions below.)
+
+class MergeType<int val, string suffix=""> {
+  int Value = val;
+  string Suffix = suffix;
+}
+def MergeNone : MergeType<0>;
+def MergeAny : MergeType<1, "_x">;
+def MergeOp1 : MergeType<2, "_m">;
+def MergeZero : MergeType<3, "_z">;
+def MergeAnyExp : MergeType<4, "_x">; // Use merged builtin with explicit
+def MergeZeroExp : MergeType<5, "_z">; // generation of its inactive argument.
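+
+// Worked example (illustrative only, assuming the usual type-suffix
+// mangling): combining the typespec "f" with the prototype "ddhh", as in
+//   def SVMLALB_F : SInst<"svmlalb[_{d}]", "ddhh", "f", ...>;
+// makes svfloat32_t the default type; 'd' keeps that default and 'h' selects
+// half-width elements with twice the element count, so the emitted builtin
+// is expected to look roughly like
+//   svfloat32_t svmlalb_f32(svfloat32_t, svfloat16_t, svfloat16_t);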
+
+class EltType<int val> {
+  int Value = val;
+}
+def EltTyInvalid : EltType<0>;
+def EltTyInt8 : EltType<1>;
+def EltTyInt16 : EltType<2>;
+def EltTyInt32 : EltType<3>;
+def EltTyInt64 : EltType<4>;
+def EltTyFloat16 : EltType<5>;
+def EltTyFloat32 : EltType<6>;
+def EltTyFloat64 : EltType<7>;
+def EltTyBool8 : EltType<8>;
+def EltTyBool16 : EltType<9>;
+def EltTyBool32 : EltType<10>;
+def EltTyBool64 : EltType<11>;
+def EltTyBFloat16 : EltType<12>;
+
+class MemEltType<int val> {
+  int Value = val;
+}
+def MemEltTyDefault : MemEltType<0>;
+def MemEltTyInt8 : MemEltType<1>;
+def MemEltTyInt16 : MemEltType<2>;
+def MemEltTyInt32 : MemEltType<3>;
+def MemEltTyInt64 : MemEltType<4>;
+def MemEltTyInt128 : MemEltType<5>;
+
+class FlagType<int val> {
+  int Value = val;
+}
+
+// These must be kept in sync with the flags in utils/TableGen/SveEmitter.cpp
+// and include/clang/Basic/TargetBuiltins.h
+def NoFlags : FlagType<0x00000000>;
+def FirstEltType : FlagType<0x00000001>;
+// : :
+// : :
+def EltTypeMask : FlagType<0x0000000f>;
+def FirstMemEltType : FlagType<0x00000010>;
+// : :
+// : :
+def MemEltTypeMask : FlagType<0x00000070>;
+def FirstMergeTypeMask : FlagType<0x00000080>;
+// : :
+// : :
+def MergeTypeMask : FlagType<0x00000380>;
+def FirstSplatOperand : FlagType<0x00000400>;
+// : :
+// These flags are used to specify which scalar operand
+// needs to be duplicated/splatted into a vector.
+// : :
+def SplatOperandMask : FlagType<0x00001C00>;
+def IsLoad : FlagType<0x00002000>;
+def IsStore : FlagType<0x00004000>;
+def IsGatherLoad : FlagType<0x00008000>;
+def IsScatterStore : FlagType<0x00010000>;
+def IsStructLoad : FlagType<0x00020000>;
+def IsStructStore : FlagType<0x00040000>;
+def IsZExtReturn : FlagType<0x00080000>; // Return value is sign-extended by default
+def IsOverloadNone : FlagType<0x00100000>; // Intrinsic does not take any overloaded types.
+def IsOverloadWhile : FlagType<0x00200000>; // Use {default type, typeof(operand1)} as overloaded types.
+def IsOverloadWhileRW : FlagType<0x00400000>; // Use {pred(default type), typeof(operand0)} as overloaded types.
+def IsOverloadCvt : FlagType<0x00800000>; // Use {typeof(operand0), typeof(last operand)} as overloaded types.
+def OverloadKindMask : FlagType<0x00E00000>; // When the masked values are all '0', the default type is used as overload type.
+def IsByteIndexed : FlagType<0x01000000>;
+def IsAppendSVALL : FlagType<0x02000000>; // Appends SV_ALL as the last operand.
+def IsInsertOp1SVALL : FlagType<0x04000000>; // Inserts SV_ALL as the second operand.
+def IsPrefetch : FlagType<0x08000000>; // Contiguous prefetches.
+def IsGatherPrefetch : FlagType<0x10000000>;
+def ReverseCompare : FlagType<0x20000000>; // Compare operands must be swapped.
+def ReverseUSDOT : FlagType<0x40000000>; // Unsigned/signed operands must be swapped.
+def IsUndef : FlagType<0x80000000>; // Codegen `undef` of given type.
+def IsTupleCreate : FlagType<0x100000000>;
+def IsTupleGet : FlagType<0x200000000>;
+def IsTupleSet : FlagType<0x400000000>;
+def IsStreaming : FlagType<0x800000000>;
+def IsStreamingCompatible : FlagType<0x1000000000>;
+def IsSharedZA : FlagType<0x2000000000>;
+def IsPreservedZA : FlagType<0x4000000000>;
+def IsReadZA : FlagType<0x8000000000>;
+def IsWriteZA : FlagType<0x10000000000>;
+def IsZASliceBaseOffsetIntr : FlagType<0x20000000000>;
+def IsTileBaseOffsetIntr : FlagType<0x40000000000>;
+def ReverseMergeAnyBinOp : FlagType<0x80000000000>; // e.g. Implement SUBR_X using SUB_X.
+def ReverseMergeAnyAccOp : FlagType<0x100000000000>; // e.g.
Implement MSB_X using MLS_X. + +// These must be kept in sync with the flags in include/clang/Basic/TargetBuiltins.h +class ImmCheckType { + int Value = val; +} +def ImmCheck0_31 : ImmCheckType<0>; // 0..31 (used for e.g. predicate patterns) +def ImmCheck1_16 : ImmCheckType<1>; // 1..16 +def ImmCheckExtract : ImmCheckType<2>; // 0..(2048/sizeinbits(elt) - 1) +def ImmCheckShiftRight : ImmCheckType<3>; // 1..sizeinbits(elt) +def ImmCheckShiftRightNarrow : ImmCheckType<4>; // 1..sizeinbits(elt)/2 +def ImmCheckShiftLeft : ImmCheckType<5>; // 0..(sizeinbits(elt) - 1) +def ImmCheck0_7 : ImmCheckType<6>; // 0..7 +def ImmCheckLaneIndex : ImmCheckType<7>; // 0..(128/(1*sizeinbits(elt)) - 1) +def ImmCheckLaneIndexCompRotate : ImmCheckType<8>; // 0..(128/(2*sizeinbits(elt)) - 1) +def ImmCheckLaneIndexDot : ImmCheckType<9>; // 0..(128/(4*sizeinbits(elt)) - 1) +def ImmCheckComplexRot90_270 : ImmCheckType<10>; // [90,270] +def ImmCheckComplexRotAll90 : ImmCheckType<11>; // [0, 90, 180,270] +def ImmCheck0_13 : ImmCheckType<12>; // 0..13 +def ImmCheck0_1 : ImmCheckType<13>; // 0..1 +def ImmCheck0_2 : ImmCheckType<14>; // 0..2 +def ImmCheck0_3 : ImmCheckType<15>; // 0..3 +def ImmCheck0 : ImmCheckType<16>; // Must be 0. +def ImmCheck0_15 : ImmCheckType<17>; // 0..15 +def ImmCheck0_255 : ImmCheckType<18>; // 0..255 +def ImmCheck0_2_Mul2 : ImmCheckType<19>; // 0, 2 +def ImmCheck0_6_Mul2 : ImmCheckType<20>; // 0, 2, 4, 6 +def ImmCheck0_14_Mul2 : ImmCheckType<21>; // 0, 2, .., 14 +def ImmCheck0_4_Mul4 : ImmCheckType<22>; // 0, 4 +def ImmCheck0_12_Mul4 : ImmCheckType<23>; // 0, 4, 8, 12 +def ImmCheck2_4_Mul2 : ImmCheckType<24>; // 2, 4 +def ImmCheck0_56_Mul8 : ImmCheckType<25>; // 0, 8, .., 56 + + +class ImmCheck { + int Arg = arg; + int EltSizeArg = eltSizeArg; + ImmCheckType Kind = kind; +} diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -7014,6 +7014,8 @@ NestedNameSpecInfo &IdInfo, bool EnteringContext); + bool IsInvalidSMECallConversion(QualType FromType, QualType ToType); + /// The parser has parsed a nested-name-specifier /// 'template[opt] template-name < template-args >::'. /// @@ -13515,7 +13517,10 @@ bool CheckNeonBuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID, CallExpr *TheCall); bool CheckMVEBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall); + bool ParseSVEImmChecks(CallExpr *TheCall, + SmallVector, 3> &ImmChecks); bool CheckSVEBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall); + bool CheckSMEBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall); bool CheckCDEBuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID, CallExpr *TheCall); bool CheckARMCoprocessorImmediate(const TargetInfo &TI, const Expr *CoprocArg, diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp --- a/clang/lib/AST/ASTContext.cpp +++ b/clang/lib/AST/ASTContext.cpp @@ -11440,6 +11440,10 @@ Type = Context.getScalableVectorType(ElementType, NumElements); break; } + case 'Q': { + Type = Context.SveCountTy; + break; + } case 'V': { char *End; unsigned NumElements = strtoul(Str, &End, 10); diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp --- a/clang/lib/AST/Type.cpp +++ b/clang/lib/AST/Type.cpp @@ -3344,6 +3344,13 @@ argSlot[i] = params[i]; } + // Propagate the SME ACLE attributes. 
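+  // For illustration (assumed ACLE usage, not taken from this patch): a
+  // declaration such as
+  //   void f(void) __attribute__((arm_streaming, arm_shared_za));
+  // is expected to arrive here with AArch64SMEAttributes equal to
+  // SME_PStateSMEnabledMask | SME_PStateZASharedMask, which is then stored
+  // in the trailing FunctionTypeExtraBitfields below.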
+ if (epi.AArch64SMEAttributes != SME_NormalFunction) { + auto &ExtraBits = *getTrailingObjects(); + ExtraBits.AArch64SMEAttributes = epi.AArch64SMEAttributes; + } + + // Fill in the exception type array if present. if (getExceptionSpecType() == EST_Dynamic) { auto &ExtraBits = *getTrailingObjects(); @@ -3537,6 +3544,8 @@ for (unsigned i = 0; i != NumParams; ++i) ID.AddInteger(epi.ExtParameterInfos[i].getOpaqueValue()); } + ID.AddInteger(epi.AArch64SMEAttributes); + epi.ExtInfo.Profile(ID); ID.AddBoolean(epi.HasTrailingReturn); } diff --git a/clang/lib/AST/TypePrinter.cpp b/clang/lib/AST/TypePrinter.cpp --- a/clang/lib/AST/TypePrinter.cpp +++ b/clang/lib/AST/TypePrinter.cpp @@ -938,6 +938,21 @@ FunctionType::ExtInfo Info = T->getExtInfo(); + if ((T->getAArch64SMEAttributes() & + FunctionType::SME_PStateSMCompatibleMask) && + !InsideCCAttribute) + OS << " __attribute__((arm_streaming_compatible))"; + if ((T->getAArch64SMEAttributes() & FunctionType::SME_PStateSMEnabledMask) && + !InsideCCAttribute) + OS << " __attribute__((arm_streaming))"; + if ((T->getAArch64SMEAttributes() & FunctionType::SME_PStateZASharedMask) && + !InsideCCAttribute) + OS << " __attribute__((arm_shared_za))"; + if ((T->getAArch64SMEAttributes() & + FunctionType::SME_PStateZAPreservedMask) && + !InsideCCAttribute) + OS << " __attribute__((arm_preserves_za))"; + printFunctionAfter(Info, OS); if (!T->getMethodQuals().empty()) @@ -1819,6 +1834,10 @@ break; } case attr::AArch64VectorPcs: OS << "aarch64_vector_pcs"; break; + case attr::ArmStreaming: OS << "arm_streaming"; break; + case attr::ArmStreamingCompatible: OS << "arm_streaming_compatible"; break; + case attr::ArmPreservesZA: OS << "arm_preserves_za"; break; + case attr::ArmSharedZA: OS << "arm_shared_za"; break; case attr::AArch64SVEPcs: OS << "aarch64_sve_pcs"; break; case attr::AMDGPUKernelCall: OS << "amdgpu_kernel"; break; case attr::IntelOclBicc: OS << "inteloclbicc"; break; diff --git a/clang/lib/Basic/Targets/AArch64.h b/clang/lib/Basic/Targets/AArch64.h --- a/clang/lib/Basic/Targets/AArch64.h +++ b/clang/lib/Basic/Targets/AArch64.h @@ -70,6 +70,7 @@ bool HasSME = false; bool HasSMEF64F64 = false; bool HasSMEI16I64 = false; + bool HasSME2 = false; bool HasSB = false; bool HasPredRes = false; bool HasSSBS = false; diff --git a/clang/lib/Basic/Targets/AArch64.cpp b/clang/lib/Basic/Targets/AArch64.cpp --- a/clang/lib/Basic/Targets/AArch64.cpp +++ b/clang/lib/Basic/Targets/AArch64.cpp @@ -37,6 +37,12 @@ {#ID, TYPE, ATTRS, FEATURE, HeaderDesc::NO_HEADER, ALL_LANGUAGES}, #include "clang/Basic/BuiltinsSVE.def" +#define BUILTIN(ID, TYPE, ATTRS) \ + {#ID, TYPE, ATTRS, nullptr, HeaderDesc::NO_HEADER, ALL_LANGUAGES}, +#define TARGET_BUILTIN(ID, TYPE, ATTRS, FEATURE) \ + {#ID, TYPE, ATTRS, FEATURE, HeaderDesc::NO_HEADER, ALL_LANGUAGES}, +#include "clang/Basic/BuiltinsSME.def" + #define BUILTIN(ID, TYPE, ATTRS) \ {#ID, TYPE, ATTRS, nullptr, HeaderDesc::NO_HEADER, ALL_LANGUAGES}, #define LANGBUILTIN(ID, TYPE, ATTRS, LANG) \ @@ -409,6 +415,18 @@ if (HasSVE2 && HasSVE2AES) Builder.defineMacro("__ARM_FEATURE_SVE2_AES", "1"); + if (HasSME) + Builder.defineMacro("__ARM_FEATURE_SME", "1"); + + if (HasSMEI16I64) + Builder.defineMacro("__ARM_FEATURE_SME_I16I64", "1"); + + if (HasSMEF64F64) + Builder.defineMacro("__ARM_FEATURE_SME_F64F64", "1"); + + if (HasSME2) + Builder.defineMacro("__ARM_FEATURE_SME2", "1"); + if (HasSVE2 && HasSVE2BitPerm) Builder.defineMacro("__ARM_FEATURE_SVE2_BITPERM", "1"); @@ -668,6 +686,7 @@ .Case("sve2-sha3", FPU & SveMode && HasSVE2SHA3) 
.Case("sve2-sm4", FPU & SveMode && HasSVE2SM4) .Case("sme", HasSME) + .Case("sme2", HasSME2) .Case("sme-f64f64", HasSMEF64F64) .Case("sme-i16i64", HasSMEI16I64) .Cases("memtag", "memtag2", HasMTE) @@ -782,18 +801,31 @@ HasMatmulFP64 = true; } if (Feature == "+sme") { + FPU |= SveMode; HasSME = true; HasBFloat16 = true; + HasFullFP16 = true; } if (Feature == "+sme-f64f64") { + FPU |= SveMode; HasSME = true; HasSMEF64F64 = true; HasBFloat16 = true; + HasFullFP16 = true; } if (Feature == "+sme-i16i64") { + FPU |= SveMode; HasSME = true; HasSMEI16I64 = true; HasBFloat16 = true; + HasFullFP16 = true; + } + if (Feature == "+sme2") { + FPU |= SveMode; + HasSME = true; + HasSME2 = true; + HasBFloat16 = true; + HasFullFP16 = true; } if (Feature == "+sb") HasSB = true; diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -6743,11 +6743,29 @@ #undef SVEMAP1 #undef SVEMAP2 +#define SMEMAP1(NameBase, LLVMIntrinsic, TypeModifier) \ + { \ +#NameBase, SME::BI__builtin_sme_##NameBase, Intrinsic::LLVMIntrinsic, 0, \ + TypeModifier \ + } + +#define SMEMAP2(NameBase, TypeModifier) \ + { #NameBase, SME::BI__builtin_sme_##NameBase, 0, 0, TypeModifier } +static const ARMVectorIntrinsicInfo AArch64SMEIntrinsicMap[] = { +#define GET_SME_LLVM_INTRINSIC_MAP +#include "clang/Basic/arm_sme_builtin_cg.inc" +#undef GET_SME_LLVM_INTRINSIC_MAP +}; + +#undef SMEMAP1 +#undef SMEMAP2 + static bool NEONSIMDIntrinsicsProvenSorted = false; static bool AArch64SIMDIntrinsicsProvenSorted = false; static bool AArch64SISDIntrinsicsProvenSorted = false; static bool AArch64SVEIntrinsicsProvenSorted = false; +static bool AArch64SMEIntrinsicsProvenSorted = false; static const ARMVectorIntrinsicInfo * findARMVectorIntrinsicInMap(ArrayRef IntrinsicMap, @@ -8879,6 +8897,8 @@ return Builder.getInt32Ty(); case SVETypeFlags::MemEltTyInt64: return Builder.getInt64Ty(); + case SVETypeFlags::MemEltTyInt128: + return Builder.getInt128Ty(); } llvm_unreachable("Unknown MemEltType"); } @@ -9005,6 +9025,10 @@ // the elements of the specified datatype. Value *CodeGenFunction::EmitSVEPredicateCast(Value *Pred, llvm::ScalableVectorType *VTy) { + if (isa(Pred->getType()) && + cast(Pred->getType())->getName() == "aarch64.svcount") + return Pred; + auto *RTy = llvm::VectorType::get(IntegerType::get(getLLVMContext(), 1), VTy); if (Pred->getType() == RTy) return Pred; @@ -9014,6 +9038,7 @@ switch (VTy->getMinNumElements()) { default: llvm_unreachable("unsupported element count!"); + case 1: case 2: case 4: case 8: @@ -9180,12 +9205,16 @@ unsigned N; switch (IntID) { case Intrinsic::aarch64_sve_ld2_sret: + case Intrinsic::aarch64_sve_ld1_pn_vg2: + case Intrinsic::aarch64_sve_ldnt1_pn_vg2: N = 2; break; case Intrinsic::aarch64_sve_ld3_sret: N = 3; break; case Intrinsic::aarch64_sve_ld4_sret: + case Intrinsic::aarch64_sve_ld1_pn_vg4: + case Intrinsic::aarch64_sve_ldnt1_pn_vg4: N = 4; break; default: @@ -9194,13 +9223,14 @@ auto RetTy = llvm::VectorType::get(VTy->getElementType(), VTy->getElementCount() * N); - Value *Predicate = EmitSVEPredicateCast(Ops[0], VTy); + Value *Predicate = EmitSVEPredicateCast(Ops[0], VTy); Value *BasePtr= Builder.CreateBitCast(Ops[1], VecPtrTy); - // Does the load have an offset? if (Ops.size() > 2) BasePtr = Builder.CreateGEP(VTy, BasePtr, Ops[2]); + //Value *Offset = Ops.size() > 2 ? 
Ops[2] : Builder.getInt32(0); + //BasePtr = Builder.CreateGEP(VTy, BasePtr, Offset); BasePtr = Builder.CreateBitCast(BasePtr, EltPtrTy); Function *F = CGM.getIntrinsic(IntID, {VTy}); Value *Call = Builder.CreateCall(F, {Predicate, BasePtr}); @@ -9224,12 +9254,16 @@ unsigned N; switch (IntID) { case Intrinsic::aarch64_sve_st2: + case Intrinsic::aarch64_sve_st1_pn_vg2: + case Intrinsic::aarch64_sve_stnt1_pn_vg2: N = 2; break; case Intrinsic::aarch64_sve_st3: N = 3; break; case Intrinsic::aarch64_sve_st4: + case Intrinsic::aarch64_sve_st1_pn_vg4: + case Intrinsic::aarch64_sve_stnt1_pn_vg4: N = 4; break; default: @@ -9239,25 +9273,20 @@ Value *Predicate = EmitSVEPredicateCast(Ops[0], VTy); Value *BasePtr = Builder.CreateBitCast(Ops[1], VecPtrTy); - // Does the store have an offset? - if (Ops.size() > 3) + if (Ops.size() > (2 + N)) BasePtr = Builder.CreateGEP(VTy, BasePtr, Ops[2]); - BasePtr = Builder.CreateBitCast(BasePtr, EltPtrTy); - Value *Val = Ops.back(); // The llvm.aarch64.sve.st2/3/4 intrinsics take legal part vectors, so we // need to break up the tuple vector. SmallVector Operands; - unsigned MinElts = VTy->getElementCount().getKnownMinValue(); - for (unsigned I = 0; I < N; ++I) { - Value *Idx = ConstantInt::get(CGM.Int64Ty, I * MinElts); - Operands.push_back(Builder.CreateExtractVector(VTy, Val, Idx)); - } + for (unsigned I = Ops.size() - N; I < Ops.size(); ++I) + Operands.push_back(Ops[I]); Operands.append({Predicate, BasePtr}); - Function *F = CGM.getIntrinsic(IntID, { VTy }); + return Builder.CreateCall(F, Operands); + } // SVE2's svpmullb and svpmullt builtins are similar to the svpmullb_pair and @@ -9430,6 +9459,165 @@ return {DefaultType}; } +static llvm::Value *getSMESliceIndex(CGBuilderTy &Builder, llvm::Value *Base, + llvm::Value *Offset) { + llvm::APInt OffsetVal = cast(Offset)->getValue().trunc(32); + Offset = ConstantInt::get(Base->getType(), OffsetVal); + return Builder.CreateAdd(Base, Offset); +} + +Value *CodeGenFunction::EmitSMELoadStore(const SVETypeFlags &TypeFlags, + unsigned BuiltinID, + SmallVectorImpl &Ops) { + Function *F = CGM.getIntrinsic(BuiltinID); + Function *Cntsb = CGM.getIntrinsic(Intrinsic::aarch64_sme_cntsb); + + // We support 3 classes of builtins here: + // * 3-op forms: + // svldr_vnum_za(slice_base, slice_offset, ptr) + // svstr_vnum_za(slice_base, slice_offset, ptr) + if (Ops.size() == 3) { + // For the 3-op form, slice_offset == vnum. 
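+    // Sketch of the lowering done below (assuming aarch64_sme_cntsb returns
+    // the streaming vector length in bytes):
+    //   ptr   <- ptr + vnum * cntsb()
+    //   slice <- slice_base + vnum
+    //   call <ZA ldr/str intrinsic>(slice, ptr)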
+ Value *VNum = Ops[1]; + llvm::Value *SVL = Builder.CreateCall(Cntsb); + Value *Offset = + Builder.CreateMul(Builder.CreateZExtOrTrunc(VNum, SVL->getType()), SVL); + Value *Ptr = Builder.CreateGEP(Builder.getInt8Ty(), /*Ptr*/ Ops[2], Offset); + Value *Slice = getSMESliceIndex(Builder, /*Base*/ Ops[0], VNum); + return Builder.CreateCall(F, {Slice, Ptr}); + } + + Value *Ptr = Ops[4]; + Value *Slice = getSMESliceIndex(Builder, Ops[1], Ops[2]); + llvm::ScalableVectorType *OverloadedTy = + getSVEVectorForElementType(SVEBuiltinMemEltTy(TypeFlags)); + Value *Pn = EmitSVEPredicateCast(/*Pn*/ Ops[3], OverloadedTy); + + // * 5-op forms: + // svld1_(ver|hor)_zaX(tile, slice_base, slice_offset, pn, ptr) + // svst1_(ver|hor)_zaX(tile, slice_base, slice_offset, pn, ptr) + if (Ops.size() == 5) + return Builder.CreateCall(F, {Pn, Ptr, /*Tile*/ Ops[0], Slice}); + + // + // * 6-op forms: + // svld1_(ver|hor)_vnum_zaX(tile, slice_base, slice_offset, pn, ptr, vnum) + // svst1_(ver|hor)_vnum_zaX(tile, slice_base, slice_offset, pn, ptr, vnum) + assert (Ops.size() == 6 && "Unexpected number of ops"); + Value *SVL = Builder.CreateCall(Cntsb); + Value *Offset = Builder.CreateMul( + Builder.CreateZExtOrTrunc(/*VNum*/ Ops[5], SVL->getType()), SVL); + Ptr = Builder.CreateGEP(Builder.getInt8Ty(), Ptr, Offset); + return Builder.CreateCall(F, {Pn, Ptr, /*Tile*/ Ops[0], Slice}); +} + +Value *CodeGenFunction::FormMultiVectorResult(Value *Call) { + // Multi-vector results should be broken up into a single (wide) result + // vector. + if (auto *StructTy = dyn_cast(Call->getType())) { + if (auto *VTy = + dyn_cast(StructTy->getTypeAtIndex(0U))) { + unsigned N = StructTy->getNumElements(); + + // We may need to emit a cast to a svbool_t + bool IsPredTy = VTy->getElementType()->isIntegerTy(1); + unsigned MinElts = IsPredTy ? 16 : VTy->getMinNumElements(); + + ScalableVectorType *WideVTy = + ScalableVectorType::get(VTy->getElementType(), MinElts * N); + Value *Ret = llvm::PoisonValue::get(WideVTy); + for (unsigned I = 0; I < N; ++I) { + Value *SRet = Builder.CreateExtractValue(Call, I); + assert(SRet->getType() == VTy && "Unexpected type for result value"); + Value *Idx = ConstantInt::get(CGM.Int64Ty, I * MinElts); + + if (IsPredTy) + SRet = EmitSVEPredicateCast( + SRet, ScalableVectorType::get(Builder.getInt1Ty(), 16)); + + Ret = Builder.CreateInsertVector(WideVTy, Ret, SRet, Idx); + } + Call = Ret; + } + } + + return Call; +} + +Value *CodeGenFunction::EmitAArch64SMEBuiltinExpr(unsigned BuiltinID, + const CallExpr *E) { + + // Find out if any arguments are required to be integer constant expressions. + unsigned ICEArguments = 0; + ASTContext::GetBuiltinTypeError Error; + getContext().GetBuiltinType(BuiltinID, Error, &ICEArguments); + assert(Error == ASTContext::GE_None && "Should not codegen an error"); + + auto *Builtin = findARMVectorIntrinsicInMap(AArch64SMEIntrinsicMap, BuiltinID, + AArch64SMEIntrinsicsProvenSorted); + + llvm::SmallVector Ops; + SVETypeFlags TypeFlags(Builtin->TypeModifier); + GetAArch64SMEProcessedOperands(BuiltinID, E, Ops, TypeFlags); + + if (TypeFlags.isLoad() || TypeFlags.isStore()) + return EmitSMELoadStore(TypeFlags, Builtin->LLVMIntrinsic, Ops); + + // Should not happen! + if (Builtin->LLVMIntrinsic == 0) + return nullptr; + + if (TypeFlags.isZASliceBaseOffsetIntr() || TypeFlags.isTileBaseOffsetIntr()) { + unsigned BaseIdx = TypeFlags.isZASliceBaseOffsetIntr() ? 
0 : 1; + + llvm::Value *SliceBase = Ops[BaseIdx]; + llvm::Value *SliceOff = Ops[BaseIdx + 1]; + llvm::Value *Slice = getSMESliceIndex(Builder, SliceBase, SliceOff); + Ops[BaseIdx] = Slice; + Ops.erase(Ops.begin() + BaseIdx + 1); + } + + // Predicates must match the main datatype. + for (unsigned i = 0, e = Ops.size(); i != e; ++i) + if (auto PredTy = dyn_cast(Ops[i]->getType())) + if (PredTy->getElementType()->isIntegerTy(1)) { + PredTy = getSVEType(TypeFlags); + Ops[i] = EmitSVEPredicateCast(Ops[i], PredTy); + } + + if (TypeFlags.isReadZA() || TypeFlags.isWriteZA()) { + llvm::ScalableVectorType *FnType = getSVEType(TypeFlags); + Function *F = CGM.getIntrinsic(Builtin->LLVMIntrinsic, {FnType}); + + // FIXME! We probably shouldn't be adding the slice_base (Op[1]) to the + // slice_offset (Op[2]) here. Instead I think it makes sense to change the + // LLVM intrinsic to match the ACLE intrinsic. + llvm::Value *SliceBase = TypeFlags.isReadZA() ? Ops[3] : Ops[1]; + llvm::Value *SliceOff = TypeFlags.isReadZA() ? Ops[4] : Ops[2]; + llvm::Value *Slice = getSMESliceIndex(Builder, SliceBase, SliceOff); + + if (TypeFlags.isReadZA()) + return Builder.CreateCall( + F, {/*Passthru*/ Ops[0], /*Pg*/ Ops[1], /*Tile*/ Ops[2], Slice}); + else + return Builder.CreateCall( + F, {/*Tile*/ Ops[0], Slice, /*Pg*/ Ops[3], /*Write data*/ Ops[4]}); + } + + Function *F = + TypeFlags.isOverloadNone() + ? CGM.getIntrinsic(Builtin->LLVMIntrinsic) + : CGM.getIntrinsic(Builtin->LLVMIntrinsic, {getSVEType(TypeFlags)}); + Value *Call = Builder.CreateCall(F, Ops); + + // Predicate results must be converted to svbool_t. + if (auto PredTy = dyn_cast(Call->getType())) + if (PredTy->getScalarType()->isIntegerTy(1)) + Call = EmitSVEPredicateCast(Call, ScalableVectorType::get(Builder.getInt1Ty(), 16)); + + return FormMultiVectorResult(Call); +} + Value *CodeGenFunction::EmitSVETupleSetOrGet(const SVETypeFlags &TypeFlags, llvm::Type *Ty, ArrayRef Ops) { @@ -9463,24 +9651,34 @@ return Call; } -Value *CodeGenFunction::EmitAArch64SVEBuiltinExpr(unsigned BuiltinID, - const CallExpr *E) { +void CodeGenFunction::GetAArch64SMEProcessedOperands( + unsigned BuiltinID, const CallExpr *E, SmallVectorImpl &Ops, + SVETypeFlags TypeFlags) { // Find out if any arguments are required to be integer constant expressions. unsigned ICEArguments = 0; ASTContext::GetBuiltinTypeError Error; getContext().GetBuiltinType(BuiltinID, Error, &ICEArguments); assert(Error == ASTContext::GE_None && "Should not codegen an error"); - llvm::Type *Ty = ConvertType(E->getType()); - if (BuiltinID >= SVE::BI__builtin_sve_reinterpret_s8_s8 && - BuiltinID <= SVE::BI__builtin_sve_reinterpret_f64_f64) { - Value *Val = EmitScalarExpr(E->getArg(0)); - return EmitSVEReinterpret(Val, Ty); - } + bool IsTupleGetOrSet = TypeFlags.isTupleSet() || TypeFlags.isTupleGet(); - llvm::SmallVector Ops; for (unsigned i = 0, e = E->getNumArgs(); i != e; i++) { - if ((ICEArguments & (1 << i)) == 0) + if (!IsTupleGetOrSet && (ICEArguments & (1 << i)) == 0) { + Value *Arg = EmitScalarExpr(E->getArg(i)); + if (auto *VTy = dyn_cast(Arg->getType())) { + unsigned MinElts = VTy->getMinNumElements(); + bool IsPred = VTy->getElementType()->isIntegerTy(1); + unsigned N = + (MinElts * VTy->getScalarSizeInBits()) / (IsPred ? 
16 : 128); + for (unsigned I = 0; I < N; ++I) { + Value *Idx = ConstantInt::get(CGM.Int64Ty, (I * MinElts) / N); + auto *NewVTy = + ScalableVectorType::get(VTy->getElementType(), MinElts / N); + Ops.push_back(Builder.CreateExtractVector(NewVTy, Arg, Idx)); + } + } else + Ops.push_back(Arg); + } else if ((ICEArguments & (1 << i)) == 0) Ops.push_back(EmitScalarExpr(E->getArg(i))); else { // If this is required to be a constant, constant fold it so that we know @@ -9497,13 +9695,30 @@ } } + return; +} + +Value *CodeGenFunction::EmitAArch64SVEBuiltinExpr(unsigned BuiltinID, + const CallExpr *E) { + llvm::Type *Ty = ConvertType(E->getType()); + if (BuiltinID >= SVE::BI__builtin_sve_reinterpret_s8_s8 && + BuiltinID <= SVE::BI__builtin_sve_reinterpret_f64_f64) { + Value *Val = EmitScalarExpr(E->getArg(0)); + return EmitSVEReinterpret(Val, Ty); + } + + auto *Builtin = findARMVectorIntrinsicInMap(AArch64SVEIntrinsicMap, BuiltinID, AArch64SVEIntrinsicsProvenSorted); + llvm::SmallVector Ops; SVETypeFlags TypeFlags(Builtin->TypeModifier); + GetAArch64SMEProcessedOperands(BuiltinID, E, Ops, TypeFlags); + if (TypeFlags.isLoad()) return EmitSVEMaskedLoad(E, Ty, Ops, Builtin->LLVMIntrinsic, TypeFlags.isZExtReturn()); else if (TypeFlags.isStore()) + return EmitSVEMaskedStore(E, Ops, Builtin->LLVMIntrinsic); else if (TypeFlags.isGatherLoad()) return EmitSVEGatherLoad(TypeFlags, Ops, Builtin->LLVMIntrinsic); @@ -9513,10 +9728,10 @@ return EmitSVEPrefetchLoad(TypeFlags, Ops, Builtin->LLVMIntrinsic); else if (TypeFlags.isGatherPrefetch()) return EmitSVEGatherPrefetch(TypeFlags, Ops, Builtin->LLVMIntrinsic); - else if (TypeFlags.isStructLoad()) - return EmitSVEStructLoad(TypeFlags, Ops, Builtin->LLVMIntrinsic); - else if (TypeFlags.isStructStore()) - return EmitSVEStructStore(TypeFlags, Ops, Builtin->LLVMIntrinsic); + else if (TypeFlags.isStructLoad()) + return EmitSVEStructLoad(TypeFlags, Ops, Builtin->LLVMIntrinsic); + else if (TypeFlags.isStructStore()) + return EmitSVEStructStore(TypeFlags, Ops, Builtin->LLVMIntrinsic); else if (TypeFlags.isTupleSet() || TypeFlags.isTupleGet()) return EmitSVETupleSetOrGet(TypeFlags, Ty, Ops); else if (TypeFlags.isTupleCreate()) @@ -9567,6 +9782,17 @@ Ops[1] = Builder.CreateSelect(Ops[0], Ops[1], SplatZero); } + // Needed for the psel instruction. + // FIXME: This isn't really a ZA slice, but rather a lane index that takes + // the same form of w12-w15 reg + immediate offset. 
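+  // The builtin takes a (slice base, immediate offset) pair at operand
+  // positions 2 and 3; fold them into a single index operand and drop the
+  // separate offset, so the underlying intrinsic only sees one lane-index
+  // value.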
+ if (TypeFlags.isZASliceBaseOffsetIntr()) { + llvm::Value *SliceBase = Ops[2]; + llvm::Value *SliceOff = Ops[3]; + llvm::Value *Slice = getSMESliceIndex(Builder, SliceBase, SliceOff); + Ops[2] = Slice; + Ops.erase(Ops.begin() + 3); + } + Function *F = CGM.getIntrinsic(Builtin->LLVMIntrinsic, getSVEOverloadTypes(TypeFlags, Ty, Ops)); Value *Call = Builder.CreateCall(F, Ops); @@ -9576,13 +9802,52 @@ if (PredTy->getScalarType()->isIntegerTy(1)) Call = EmitSVEPredicateCast(Call, cast(Ty)); - return Call; + return FormMultiVectorResult(Call); } switch (BuiltinID) { default: return nullptr; + case SVE::BI__builtin_sve_svreinterpret_b: { + auto SVCountTy = llvm::TargetExtType::get(getLLVMContext(), "aarch64.svcount"); + Function *CastFromSVCountF = CGM.getIntrinsic( + Intrinsic::aarch64_sve_convert_to_svbool, SVCountTy); + return Builder.CreateCall(CastFromSVCountF, Ops[0]); + } + case SVE::BI__builtin_sve_svreinterpret_c: { + auto SVCountTy = llvm::TargetExtType::get(getLLVMContext(), "aarch64.svcount"); + Function *CastToSVCountF = CGM.getIntrinsic( + Intrinsic::aarch64_sve_convert_from_svbool, SVCountTy); + return Builder.CreateCall(CastToSVCountF, Ops[0]); + } + + case SVE::BI__builtin_sve_svpsel_lane_b8: + case SVE::BI__builtin_sve_svpsel_lane_b16: + case SVE::BI__builtin_sve_svpsel_lane_b32: + case SVE::BI__builtin_sve_svpsel_lane_b64: + case SVE::BI__builtin_sve_svpsel_lane_c8: + case SVE::BI__builtin_sve_svpsel_lane_c16: + case SVE::BI__builtin_sve_svpsel_lane_c32: + case SVE::BI__builtin_sve_svpsel_lane_c64: { + bool IsSVCount = isa(Ops[0]->getType()); + assert(((!IsSVCount || cast(Ops[0]->getType())->getName() == + "aarch64.svcount")) && + "Unexpected TargetExtType"); + auto SVCountTy = llvm::TargetExtType::get(getLLVMContext(), "aarch64.svcount"); + Function *CastFromSVCountF = CGM.getIntrinsic( + Intrinsic::aarch64_sve_convert_to_svbool, SVCountTy); + Function *CastToSVCountF = CGM.getIntrinsic( + Intrinsic::aarch64_sve_convert_from_svbool, SVCountTy); + + auto OverloadedTy = getSVEType(SVETypeFlags(Builtin->TypeModifier)); + Function *F = CGM.getIntrinsic(Intrinsic::aarch64_sve_psel, OverloadedTy); + llvm::Value *Ops0 = IsSVCount ? Builder.CreateCall(CastFromSVCountF, Ops[0]) : Ops[0]; + llvm::Value *Ops1 = EmitSVEPredicateCast(Ops[1], OverloadedTy); + llvm::Value *Ops2 = getSMESliceIndex(Builder, Ops[2], Ops[3]); + llvm::Value *PSel = Builder.CreateCall(F, {Ops0, Ops1, Ops2}); + return IsSVCount ? 
Builder.CreateCall(CastToSVCountF, PSel) : PSel; + } case SVE::BI__builtin_sve_svmov_b_z: { // svmov_b_z(pg, op) <=> svand_b_z(pg, op, op) SVETypeFlags TypeFlags(Builtin->TypeModifier); @@ -9704,6 +9969,13 @@ case SVE::BI__builtin_sve_svpfalse_b: return ConstantInt::getFalse(Ty); + case SVE::BI__builtin_sve_svpfalse_c: { + auto SVBoolTy = ScalableVectorType::get(Builder.getInt1Ty(), 16); + Function *CastToSVCountF = CGM.getIntrinsic( + Intrinsic::aarch64_sve_convert_from_svbool, Ty); + return Builder.CreateCall(CastToSVCountF, ConstantInt::getFalse(SVBoolTy)); + } + case SVE::BI__builtin_sve_svlen_bf16: case SVE::BI__builtin_sve_svlen_f16: case SVE::BI__builtin_sve_svlen_f32: @@ -9739,13 +10011,8 @@ case SVE::BI__builtin_sve_svtbl2_f64: { SVETypeFlags TF(Builtin->TypeModifier); auto VTy = cast(getSVEType(TF)); - Value *V0 = Builder.CreateExtractVector(VTy, Ops[0], - ConstantInt::get(CGM.Int64Ty, 0)); - unsigned MinElts = VTy->getMinNumElements(); - Value *V1 = Builder.CreateExtractVector( - VTy, Ops[0], ConstantInt::get(CGM.Int64Ty, MinElts)); Function *F = CGM.getIntrinsic(Intrinsic::aarch64_sve_tbl2, VTy); - return Builder.CreateCall(F, {V0, V1, Ops[1]}); + return Builder.CreateCall(F, Ops); } case SVE::BI__builtin_sve_svset_neonq_s8: @@ -9808,6 +10075,10 @@ BuiltinID <= clang::AArch64::LastSVEBuiltin) return EmitAArch64SVEBuiltinExpr(BuiltinID, E); + if (BuiltinID >= AArch64::FirstSMEBuiltin && + BuiltinID <= AArch64::LastSMEBuiltin) + return EmitAArch64SMEBuiltinExpr(BuiltinID, E); + unsigned HintID = static_cast(-1); switch (BuiltinID) { default: break; @@ -9841,6 +10112,26 @@ return Builder.CreateCall(F, llvm::ConstantInt::get(Int32Ty, HintID)); } + if (BuiltinID == clang::AArch64::BI__builtin_arm_get_sme_state) { + // Create call to __arm_sme_state and store the results to the two pointers. 
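// (Illustrative sketch, not part of the patch: at the source level this builtin
// would be used roughly as
//   uint64_t a, b;
//   __builtin_arm_get_sme_state(&a, &b);
// The two values returned by the __arm_sme_state support routine are stored
// through the two pointer arguments; what the individual bits mean is defined
// by the SME ABI rather than by this lowering.)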
+ CallInst *CI = EmitRuntimeCall(CGM.CreateRuntimeFunction( + llvm::FunctionType::get(StructType::get(CGM.Int64Ty, CGM.Int64Ty), {}, + false), + "__arm_sme_state")); + auto Attrs = + AttributeList() + .addFnAttribute(getLLVMContext(), "aarch64_pstate_sm_compatible") + .addFnAttribute(getLLVMContext(), "aarch64_pstate_za_preserved"); + CI->setAttributes(Attrs); + CI->setCallingConv( + llvm::CallingConv:: + AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2); + Builder.CreateStore(Builder.CreateExtractValue(CI, 0), + EmitPointerWithAlignment(E->getArg(0))); + return Builder.CreateStore(Builder.CreateExtractValue(CI, 1), + EmitPointerWithAlignment(E->getArg(1))); + } + if (BuiltinID == clang::AArch64::BI__builtin_arm_rbit) { assert((getContext().getTypeSize(E->getType()) == 32) && "rbit of unusual size!"); diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp --- a/clang/lib/CodeGen/CGCall.cpp +++ b/clang/lib/CodeGen/CGCall.cpp @@ -1768,6 +1768,15 @@ if (!isUnresolvedExceptionSpec(FPT->getExceptionSpecType()) && FPT->isNothrow()) FuncAttrs.addAttribute(llvm::Attribute::NoUnwind); + + if (FPT->getAArch64SMEAttributes() & FunctionType::SME_PStateSMEnabledMask) + FuncAttrs.addAttribute("aarch64_pstate_sm_enabled"); + if (FPT->getAArch64SMEAttributes() & FunctionType::SME_PStateSMCompatibleMask) + FuncAttrs.addAttribute("aarch64_pstate_sm_compatible"); + if (FPT->getAArch64SMEAttributes() & FunctionType::SME_PStateZASharedMask) + FuncAttrs.addAttribute("aarch64_pstate_za_shared"); + if (FPT->getAArch64SMEAttributes() & FunctionType::SME_PStateZAPreservedMask) + FuncAttrs.addAttribute("aarch64_pstate_za_preserved"); } static void AddAttributesFromAssumes(llvm::AttrBuilder &FuncAttrs, @@ -2368,6 +2377,13 @@ llvm::toStringRef(CodeGenOpts.UniformWGSize)); } } + + + if (TargetDecl->hasAttr()) + FuncAttrs.addAttribute("aarch64_pstate_sm_body"); + + if (TargetDecl->hasAttr()) + FuncAttrs.addAttribute("aarch64_pstate_za_new"); } // Attach "no-builtins" attributes to: diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h --- a/clang/lib/CodeGen/CodeGenFunction.h +++ b/clang/lib/CodeGen/CodeGenFunction.h @@ -4258,7 +4258,17 @@ llvm::Value *EmitSVEStructStore(const SVETypeFlags &TypeFlags, SmallVectorImpl &Ops, unsigned IntID); + + void GetAArch64SMEProcessedOperands(unsigned BuiltinID, const CallExpr *E, + SmallVectorImpl &Ops, + SVETypeFlags TypeFlags); + llvm::Value *FormMultiVectorResult(llvm::Value *Call); + llvm::Value *EmitAArch64SVEBuiltinExpr(unsigned BuiltinID, const CallExpr *E); + llvm::Value *EmitAArch64SMEBuiltinExpr(unsigned BuiltinID, const CallExpr *E); + llvm::Value *EmitSMELoadStore(const SVETypeFlags &TypeFlags, + unsigned BuiltinID, + SmallVectorImpl &Ops); llvm::Value *EmitAArch64BuiltinExpr(unsigned BuiltinID, const CallExpr *E, llvm::Triple::ArchType Arch); diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -2051,6 +2051,16 @@ return; } + // We only need to handle the 'arm_locally_streaming' attribute as a + // special case here (as opposed to e.g. 'arm_streaming'), because it + // is not set from the prototype, but rather from the definition. + // The same holds for 'arm_new_za'. 
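// (Illustrative sketch, not part of the patch: for a pair of declarations such as
//   __attribute__((arm_locally_streaming)) void f(void);
//   void f(void) { /* body */ }   // inherits the attribute from the earlier declaration
// the definition of f still needs the "aarch64_pstate_sm_body" IR attribute, which
// is why it is attached here from the declaration rather than derived from the
// function type.)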
+ if (D->hasAttr()) + B.addAttribute("aarch64_pstate_sm_body"); + + if (D->hasAttr()) + B.addAttribute("aarch64_pstate_za_new"); + // Track whether we need to add the optnone LLVM attribute, // starting with the default for this optimization level. bool ShouldAddOptNone = diff --git a/clang/lib/Driver/ToolChains/Arch/AArch64.cpp b/clang/lib/Driver/ToolChains/Arch/AArch64.cpp --- a/clang/lib/Driver/ToolChains/Arch/AArch64.cpp +++ b/clang/lib/Driver/ToolChains/Arch/AArch64.cpp @@ -101,6 +101,14 @@ Features.push_back("-sve2-sm4"); } + if (Feature == "sme-i16i64" || Feature == "sme-f64f64" || Feature == "sme2") + Features.push_back("+sme"); + else if (Feature == "nosme") { + Features.push_back("-sme-i16i64"); + Features.push_back("-sme-f64f64"); + Features.push_back("-sme2"); + } + // +sve implies +f32mm if the base architecture is >= v8.6A (except v9A) // It isn't the case in general that sve implies both f64mm and f32mm if ((ArchInfo == llvm::AArch64::ARMV8_6A || diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt --- a/clang/lib/Headers/CMakeLists.txt +++ b/clang/lib/Headers/CMakeLists.txt @@ -340,6 +340,8 @@ clang_generate_header(-gen-arm-fp16 arm_fp16.td arm_fp16.h) # Generate arm_sve.h clang_generate_header(-gen-arm-sve-header arm_sve.td arm_sve.h) + # Generate arm_sme.h + clang_generate_header(-gen-arm-sme-header arm_sme.td arm_sme.h) # Generate arm_bf16.h clang_generate_header(-gen-arm-bf16 arm_bf16.td arm_bf16.h) # Generate arm_mve.h @@ -360,6 +362,7 @@ list(APPEND aarch64_only_generated_files "${CMAKE_CURRENT_BINARY_DIR}/arm_sve.h" + "${CMAKE_CURRENT_BINARY_DIR}/arm_sme.h" "${CMAKE_CURRENT_BINARY_DIR}/arm_bf16.h" ) endif() diff --git a/clang/lib/Sema/Sema.cpp b/clang/lib/Sema/Sema.cpp --- a/clang/lib/Sema/Sema.cpp +++ b/clang/lib/Sema/Sema.cpp @@ -2057,7 +2057,8 @@ llvm::StringMap CallerFeatureMap; Context.getFunctionFeatureMap(CallerFeatureMap, FD); if (!Builtin::evaluateRequiredTargetFeatures( - "sve", CallerFeatureMap)) + "sve", CallerFeatureMap) && !Builtin::evaluateRequiredTargetFeatures( + "sme", CallerFeatureMap)) Diag(D->getLocation(), diag::err_sve_vector_in_non_sve_target) << Ty; } }; diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -2866,18 +2866,8 @@ llvm_unreachable("Invalid NeonTypeFlag!"); } -bool Sema::CheckSVEBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) { - // Range check SVE intrinsics that take immediate values. - SmallVector, 3> ImmChecks; - - switch (BuiltinID) { - default: - return false; -#define GET_SVE_IMMEDIATE_CHECK -#include "clang/Basic/arm_sve_sema_rangechecks.inc" -#undef GET_SVE_IMMEDIATE_CHECK - } - +bool Sema::ParseSVEImmChecks( + CallExpr *TheCall, SmallVector, 3> &ImmChecks) { // Perform all the immediate checks for this builtin call. 
bool HasError = false; for (auto &I : ImmChecks) { @@ -2981,14 +2971,203 @@ if (SemaBuiltinConstantArgRange(TheCall, ArgNum, 0, 3)) HasError = true; break; + case SVETypeFlags::ImmCheck0: + if (SemaBuiltinConstantArgRange(TheCall, ArgNum, 0, 0)) + HasError = true; + break; + case SVETypeFlags::ImmCheck0_15: + if (SemaBuiltinConstantArgRange(TheCall, ArgNum, 0, 15)) + HasError = true; + break; + case SVETypeFlags::ImmCheck0_255: + if (SemaBuiltinConstantArgRange(TheCall, ArgNum, 0, 255)) + HasError = true; + break; + case SVETypeFlags::ImmCheck0_2_Mul2: + if (SemaBuiltinConstantArgRange(TheCall, ArgNum, 0, 2) || + SemaBuiltinConstantArgMultiple(TheCall, ArgNum, 2)) + HasError = true; + break; + case SVETypeFlags::ImmCheck0_6_Mul2: + if (SemaBuiltinConstantArgRange(TheCall, ArgNum, 0, 6) || + SemaBuiltinConstantArgMultiple(TheCall, ArgNum, 2)) + HasError = true; + break; + case SVETypeFlags::ImmCheck0_14_Mul2: + if (SemaBuiltinConstantArgRange(TheCall, ArgNum, 0, 14) || + SemaBuiltinConstantArgMultiple(TheCall, ArgNum, 2)) + HasError = true; + break; + case SVETypeFlags::ImmCheck0_4_Mul4: + if (SemaBuiltinConstantArgRange(TheCall, ArgNum, 0, 4) || + SemaBuiltinConstantArgMultiple(TheCall, ArgNum, 4)) + HasError = true; + break; + case SVETypeFlags::ImmCheck0_12_Mul4: + if (SemaBuiltinConstantArgRange(TheCall, ArgNum, 0, 12) || + SemaBuiltinConstantArgMultiple(TheCall, ArgNum, 4)) + HasError = true; + break; + case SVETypeFlags::ImmCheck2_4_Mul2: + if (SemaBuiltinConstantArgRange(TheCall, ArgNum, 2, 4) || + SemaBuiltinConstantArgMultiple(TheCall, ArgNum, 2)) + HasError = true; + break; + case SVETypeFlags::ImmCheck0_56_Mul8: + if (SemaBuiltinConstantArgRange(TheCall, ArgNum, 0, 56) || + SemaBuiltinConstantArgMultiple(TheCall, ArgNum, 8)) + HasError = true; + break; } } - return HasError; } +enum ArmStreamingType { + ArmNonStreaming, + ArmStreaming, + ArmStreamingCompatible, + ArmLocallyStreaming +}; + +static ArmStreamingType getArmStreamingFnType(const FunctionDecl *FD) { + if (FD->hasAttr()) + return ArmLocallyStreaming; + if (const auto *T = dyn_cast(FD->getType())) { + if (T->getAArch64SMEAttributes() & FunctionType::SME_PStateSMEnabledMask) + return ArmStreaming; + if (T->getAArch64SMEAttributes() & FunctionType::SME_PStateSMCompatibleMask) + return ArmStreamingCompatible; + } + return ArmNonStreaming; +} + +static void checkArmStreamingBuiltin(Sema &S, CallExpr *TheCall, + ArmStreamingType FnType, + ArmStreamingType BuiltinType) { + assert(BuiltinType != ArmLocallyStreaming && + "Unexpected locally_streaming attribute for builtin!"); + if ((FnType == ArmStreaming || FnType == ArmLocallyStreaming) && + BuiltinType == ArmNonStreaming) + S.Diag(TheCall->getBeginLoc(), diag::warn_attribute_arm_sm_incompat_builtin) + << TheCall->getSourceRange() << "streaming or locally streaming"; + + if ((FnType == ArmStreamingCompatible) && + BuiltinType != ArmStreamingCompatible) { + S.Diag(TheCall->getBeginLoc(), diag::warn_attribute_arm_sm_incompat_builtin) + << TheCall->getSourceRange() << "streaming compatible"; + return; + } + + if (FnType == ArmNonStreaming && BuiltinType == ArmStreaming) + S.Diag(TheCall->getBeginLoc(), diag::warn_attribute_arm_sm_incompat_builtin) + << TheCall->getSourceRange() << "non-streaming"; +} + +static bool hasSMEZAState(const FunctionDecl *FD) { + if (FD->hasAttr()) + return true; + if (const auto *T = dyn_cast(FD->getType())) + if (T->getAArch64SMEAttributes() & FunctionType::SME_PStateZASharedMask) + return true; + return false; +} + +static bool hasSMEZAState(unsigned 
BuiltinID) { + switch (BuiltinID) { + default: + return false; +#define GET_SME_BUILTIN_HAS_ZA_STATE +#include "clang/Basic/arm_sme_builtins_za_state.inc" +#undef GET_SME_BUILTIN_HAS_ZA_STATE + } +} + +bool Sema::CheckSMEBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) { + if (const FunctionDecl *FD = getCurFunctionDecl()) { + ArmStreamingType FnType = getArmStreamingFnType(FD); + std::optional BuiltinType; + + switch (BuiltinID) { + default: + break; +#define GET_SME_STREAMING_ATTRS +#include "clang/Basic/arm_sme_streaming_attrs.inc" +#undef GET_SME_STREAMING_ATTRS + } + + if (BuiltinType) + checkArmStreamingBuiltin(*this, TheCall, FnType, *BuiltinType); + + if (hasSMEZAState(BuiltinID) && !hasSMEZAState(FD)) + Diag(TheCall->getBeginLoc(), + diag::warn_attribute_arm_za_builtin_no_za_state) + << TheCall->getSourceRange(); + } + + // Range check SME intrinsics that take immediate values. + SmallVector, 3> ImmChecks; + + switch (BuiltinID) { + default: + return false; +#define GET_SME_IMMEDIATE_CHECK +#include "clang/Basic/arm_sme_sema_rangechecks.inc" +#undef GET_SME_IMMEDIATE_CHECK + } + + return ParseSVEImmChecks(TheCall, ImmChecks); +} + +bool Sema::CheckSVEBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) { + if (const FunctionDecl *FD = getCurFunctionDecl()) { + ArmStreamingType SmType = getArmStreamingFnType(FD); + std::optional BuiltinType; + + switch (BuiltinID) { + default: + break; +#define GET_SVE_STREAMING_ATTRS +#include "clang/Basic/arm_sve_streaming_attrs.inc" +#undef GET_SVE_STREAMING_ATTRS + } + + if (BuiltinType) + checkArmStreamingBuiltin(*this, TheCall, SmType, *BuiltinType); + } + + // Range check SVE intrinsics that take immediate values. + SmallVector, 3> ImmChecks; + + switch (BuiltinID) { + default: + return false; +#define GET_SVE_IMMEDIATE_CHECK +#include "clang/Basic/arm_sve_sema_rangechecks.inc" +#undef GET_SVE_IMMEDIATE_CHECK + } + + return ParseSVEImmChecks(TheCall, ImmChecks); +} + bool Sema::CheckNeonBuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID, CallExpr *TheCall) { + if (const FunctionDecl *FD = getCurFunctionDecl()) { + ArmStreamingType SmType = getArmStreamingFnType(FD); + std::optional BuiltinType; + + switch (BuiltinID) { + default: + break; +#define GET_NEON_STREAMING_COMPAT_FLAG +#include "clang/Basic/arm_neon.inc" +#undef GET_NEON_STREAMING_COMPAT_FLAG + } + + if (BuiltinType) + checkArmStreamingBuiltin(*this, TheCall, SmType, *BuiltinType); + } + llvm::APSInt Result; uint64_t mask = 0; unsigned TV = 0; @@ -3351,6 +3530,9 @@ if (CheckSVEBuiltinFunctionCall(BuiltinID, TheCall)) return true; + if (CheckSMEBuiltinFunctionCall(BuiltinID, TheCall)) + return true; + // For intrinsics which take an immediate value as part of the instruction, // range check them here. unsigned i = 0, l = 0, u = 0; diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -3751,6 +3751,15 @@ } } + // It is not allowed to redeclare an SME function with different SME + // attributes. + if (IsInvalidSMECallConversion(Old->getType(), New->getType())) { + Diag(New->getLocation(), diag::err_sme_attr_mismatch) + << New->getType() << Old->getType(); + Diag(OldLocation, diag::note_previous_declaration); + return true; + } + // If a function is first declared with a calling convention, but is later // declared or defined without one, all following decls assume the calling // convention of the first. 
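For illustration, the kind of source the Sema checks above act on might look like the sketch below (the attribute spellings are the ones added by this patch; svzero_za is assumed to be one of the ZA-state builtins from the generated arm_sme.h, so treat the exact intrinsic name as a placeholder):

  __attribute__((arm_streaming)) void f(void);
  void f(void);        // rejected via err_sme_attr_mismatch: SME attributes must
                       // match across redeclarations

  void g(void) {
    svzero_za();       // warned via warn_attribute_arm_za_builtin_no_za_state:
  }                    // g() has neither arm_new_za nor arm_shared_za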
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp --- a/clang/lib/Sema/SemaDeclAttr.cpp +++ b/clang/lib/Sema/SemaDeclAttr.cpp @@ -275,6 +275,7 @@ if (const auto *A = D->getAttr()) { S.Diag(AL.getLoc(), diag::err_attributes_are_not_compatible) << AL << A; S.Diag(A->getLocation(), diag::note_conflicting_attribute); + AL.setInvalid(); return true; } return false; @@ -5702,6 +5703,14 @@ BuiltinID <= AArch64::LastSVEBuiltin; } +static bool ArmSmeAliasValid(ASTContext &Context, unsigned BuiltinID, + StringRef AliasName) { + if (Context.BuiltinInfo.isAuxBuiltinID(BuiltinID)) + BuiltinID = Context.BuiltinInfo.getAuxBuiltinID(BuiltinID); + return BuiltinID >= AArch64::FirstSMEBuiltin && + BuiltinID <= AArch64::LastSMEBuiltin; +} + static void handleArmBuiltinAliasAttr(Sema &S, Decl *D, const ParsedAttr &AL) { if (!AL.isArgIdent(0)) { S.Diag(AL.getLoc(), diag::err_attribute_argument_n_type) @@ -5714,7 +5723,8 @@ StringRef AliasName = cast(D)->getIdentifier()->getName(); bool IsAArch64 = S.Context.getTargetInfo().getTriple().isAArch64(); - if ((IsAArch64 && !ArmSveAliasValid(S.Context, BuiltinID, AliasName)) || + if ((IsAArch64 && !ArmSveAliasValid(S.Context, BuiltinID, AliasName) && + !ArmSmeAliasValid(S.Context, BuiltinID, AliasName)) || (!IsAArch64 && !ArmMveAliasValid(BuiltinID, AliasName) && !ArmCdeAliasValid(BuiltinID, AliasName))) { S.Diag(AL.getLoc(), diag::err_attribute_arm_builtin_alias); @@ -9326,6 +9336,30 @@ handleArmBuiltinAliasAttr(S, D, AL); break; + case ParsedAttr::AT_ArmLocallyStreaming: + handleSimpleAttribute(S, D, AL); + break; + + case ParsedAttr::AT_ArmNewZA: + if (auto *FPT = dyn_cast(D->getFunctionType())) { + if (FPT->getAArch64SMEAttributes() & + FunctionType::SME_PStateZASharedMask) { + S.Diag(AL.getLoc(), diag::err_attributes_are_not_compatible) + << AL << "'arm_shared_za'"; + AL.setInvalid(); + } + if (FPT->getAArch64SMEAttributes() & + FunctionType::SME_PStateZAPreservedMask) { + S.Diag(AL.getLoc(), diag::err_attributes_are_not_compatible) + << AL << "'arm_preserves_za'"; + AL.setInvalid(); + } + if (AL.isInvalid()) + return; + } + handleSimpleAttribute(S, D, AL); + break; + case ParsedAttr::AT_AcquireHandle: handleAcquireHandleAttr(S, D, AL); break; diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp --- a/clang/lib/Sema/SemaDeclCXX.cpp +++ b/clang/lib/Sema/SemaDeclCXX.cpp @@ -17683,6 +17683,14 @@ } } + // SME attributes must match when overriding a function declaration. + if (IsInvalidSMECallConversion(Old->getType(), New->getType())) { + Diag(New->getLocation(), diag::err_conflicting_overriding_attributes) + << New << New->getType() << Old->getType(); + Diag(Old->getLocation(), diag::note_overridden_virtual_function); + return true; + } + // Virtual overrides must have the same code_seg. const auto *OldCSA = Old->getAttr(); const auto *NewCSA = New->getAttr(); diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -9516,6 +9516,30 @@ ColonLoc, result, VK, OK); } +// Check that the SME attributes for PSTATE.ZA and PSTATE.SM are compatible. 
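// For example (illustrative, using the spellings introduced by this patch):
// assigning a 'void (*)(void) __attribute__((arm_streaming))' value to a plain
// 'void (*)(void)' pointer, or vice versa, is reported as an incompatible
// function pointer conversion; the only mismatch that is tolerated is dropping
// 'arm_preserves_za' from the source type, since that attribute is only a hint.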
+bool Sema::IsInvalidSMECallConversion(QualType FromType, QualType ToType) { + unsigned FromAttributes = 0, ToAttributes = 0; + if (const auto *FromFn = + dyn_cast<FunctionProtoType>(Context.getCanonicalType(FromType))) + FromAttributes = + FromFn->getAArch64SMEAttributes() & FunctionType::SME_AttributeMask; + if (const auto *ToFn = + dyn_cast<FunctionProtoType>(Context.getCanonicalType(ToType))) + ToAttributes = + ToFn->getAArch64SMEAttributes() & FunctionType::SME_AttributeMask; + + if (FromAttributes == ToAttributes) + return false; + + // Make an exception for preserves_za, which can be dropped because it's + // only a hint. + unsigned Changed = FromAttributes ^ ToAttributes; + bool DropsPreservesZA = + Changed == FunctionType::SME_PStateZAPreservedMask && + (ToAttributes & FunctionType::SME_PStateZAPreservedMask); + return !DropsPreservesZA; +} + // Check if we have a conversion between incompatible cmse function pointer // types, that is, a conversion between a function pointer with the // cmse_nonsecure_call attribute and one without. @@ -9682,6 +9706,8 @@ return Sema::IncompatibleFunctionPointer; if (IsInvalidCmseNSCallConversion(S, ltrans, rtrans)) return Sema::IncompatibleFunctionPointer; + if (S.IsInvalidSMECallConversion(ltrans, rtrans)) + return Sema::IncompatibleFunctionPointer; return ConvTy; } diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp --- a/clang/lib/Sema/SemaOverload.cpp +++ b/clang/lib/Sema/SemaOverload.cpp @@ -1686,6 +1686,26 @@ Changed = true; } + // Drop 'arm_preserves_za' if it is not present in the target type (we can + // do that because it is merely a hint). + if (const auto *FromFPT = dyn_cast<FunctionProtoType>(FromFn)) { + FunctionProtoType::ExtProtoInfo ExtInfo = FromFPT->getExtProtoInfo(); + if (ExtInfo.AArch64SMEAttributes & + FunctionType::SME_PStateZAPreservedMask) { + unsigned ToFlags = 0; + if (const auto *ToFPT = dyn_cast<FunctionProtoType>(ToFn)) + ToFlags = ToFPT->getExtProtoInfo().AArch64SMEAttributes; + if (!(ToFlags & FunctionType::SME_PStateZAPreservedMask)) { + ExtInfo.setArmSMEAttribute(FunctionType::SME_PStateZAPreservedMask, + false); + QualType QT = Context.getFunctionType( + FromFPT->getReturnType(), FromFPT->getParamTypes(), ExtInfo); + FromFn = QT->getAs<FunctionType>(); + Changed = true; + } + } + } + + // Drop 'noexcept' if not present in target type.
if (const auto *FromFPT = dyn_cast(FromFn)) { const auto *ToFPT = cast(ToFn); diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp --- a/clang/lib/Sema/SemaType.cpp +++ b/clang/lib/Sema/SemaType.cpp @@ -140,6 +140,10 @@ case ParsedAttr::AT_NoReturn: \ case ParsedAttr::AT_Regparm: \ case ParsedAttr::AT_CmseNSCall: \ + case ParsedAttr::AT_ArmStreaming: \ + case ParsedAttr::AT_ArmStreamingCompatible: \ + case ParsedAttr::AT_ArmSharedZA: \ + case ParsedAttr::AT_ArmPreservesZA: \ case ParsedAttr::AT_AnyX86NoCallerSavedRegisters: \ case ParsedAttr::AT_AnyX86NoCfCheck: \ CALLING_CONV_ATTRS_CASELIST @@ -7740,6 +7744,24 @@ llvm_unreachable("unexpected attribute kind!"); } +static bool checkMutualExclusion(TypeProcessingState &state, + const FunctionProtoType::ExtProtoInfo &EPI, + ParsedAttr &Attr, + AttributeCommonInfo::Kind OtherKind) { + auto OtherAttr = std::find_if( + state.getCurrentAttributes().begin(), state.getCurrentAttributes().end(), + [OtherKind](const ParsedAttr &A) { return A.getKind() == OtherKind; }); + if (OtherAttr == state.getCurrentAttributes().end() || OtherAttr->isInvalid()) + return false; + + Sema &S = state.getSema(); + S.Diag(Attr.getLoc(), diag::err_attributes_are_not_compatible) + << *OtherAttr << Attr; + S.Diag(OtherAttr->getLoc(), diag::note_conflicting_attribute); + Attr.setInvalid(); + return true; +} + /// Process an individual function attribute. Returns true to /// indicate that the attribute was handled, false if it wasn't. static bool handleFunctionTypeAttr(TypeProcessingState &state, ParsedAttr &attr, @@ -7869,6 +7891,54 @@ return true; } + if (attr.getKind() == ParsedAttr::AT_ArmStreaming || + attr.getKind() == ParsedAttr::AT_ArmStreamingCompatible || + attr.getKind() == ParsedAttr::AT_ArmSharedZA || + attr.getKind() == ParsedAttr::AT_ArmPreservesZA){ + if (S.CheckAttrTarget(attr) || S.CheckAttrNoArgs(attr)) + return true; + + if (!unwrapped.isFunctionType()) + return false; + + const FunctionProtoType *FnTy = unwrapped.get()->getAs(); + if (!FnTy) { + // SME ACLE attributes are not supported on K&R-style unprototyped C + // functions. + S.Diag(attr.getLoc(), diag::warn_attribute_ignored) << attr; + attr.setInvalid(); + return false; + } + + FunctionProtoType::ExtProtoInfo EPI = FnTy->getExtProtoInfo(); + switch (attr.getKind()) { + case ParsedAttr::AT_ArmStreaming: + if (checkMutualExclusion(state, EPI, attr, + ParsedAttr::AT_ArmStreamingCompatible)) + return true; + EPI.setArmSMEAttribute(FunctionType::SME_PStateSMEnabledMask); + break; + case ParsedAttr::AT_ArmStreamingCompatible: + if (checkMutualExclusion(state, EPI, attr, ParsedAttr::AT_ArmStreaming)) + return true; + EPI.setArmSMEAttribute(FunctionType::SME_PStateSMCompatibleMask); + break; + case ParsedAttr::AT_ArmSharedZA: + EPI.setArmSMEAttribute(FunctionType::SME_PStateZASharedMask); + break; + case ParsedAttr::AT_ArmPreservesZA: + EPI.setArmSMEAttribute(FunctionType::SME_PStateZAPreservedMask); + break; + default: + llvm_unreachable("Unsupported attribute"); + } + + QualType newtype = S.Context.getFunctionType(FnTy->getReturnType(), + FnTy->getParamTypes(), EPI); + type = unwrapped.wrap(S, newtype->getAs()); + return true; + } + if (attr.getKind() == ParsedAttr::AT_NoThrow) { // Delay if this is not a function type. 
if (!unwrapped.isFunctionType()) diff --git a/clang/test/AST/ast-dump-sme-attributes.cpp b/clang/test/AST/ast-dump-sme-attributes.cpp new file mode 100644 --- /dev/null +++ b/clang/test/AST/ast-dump-sme-attributes.cpp @@ -0,0 +1,66 @@ +// Test without serialization: +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -std=c++2a -ast-dump -ast-dump-filter Foo %s | FileCheck -strict-whitespace %s + +// Test with serialization: +// RUN: %clang_cc1 -std=c++20 -triple aarch64 -target-feature +sme -emit-pch -o %t %s +// RUN: %clang_cc1 -x c++ -std=c++20 -triple aarch64 -target-feature +sme -include-pch %t -ast-dump-all -ast-dump-filter Foo /dev/null \ +// RUN: | sed -e "s/ //" -e "s/ imported//" \ +// RUN: | FileCheck --strict-whitespace %s + +struct Foo { +// CHECK: |-CXXRecordDecl {{.*}} implicit struct Foo +// CHECK-NEXT: |-CXXMethodDecl {{.*}} f_streaming 'void () __attribute__((arm_streaming))' +// CHECK-NEXT: |-CXXMethodDecl {{.*}} f_streaming_compatible 'void () __attribute__((arm_streaming_compatible))' +// CHECK-NEXT: |-CXXMethodDecl {{.*}} f_locally_streaming 'void ()' +// CHECK-NEXT: | `-ArmLocallyStreamingAttr +// CHECK-NEXT: |-CXXMethodDecl {{.*}} f_shared_za 'void () __attribute__((arm_shared_za))' +// CHECK-NEXT: |-CXXMethodDecl {{.*}} f_new_za 'void ()' +// CHECK-NEXT: | `-ArmNewZAAttr +// CHECK-NEXT: |-CXXMethodDecl {{.*}} f_preserves_za 'void () __attribute__((arm_preserves_za))' + void f_streaming() __attribute__((arm_streaming)); + void f_streaming_compatible() __attribute__((arm_streaming_compatible)); + void f_locally_streaming() __attribute__((arm_locally_streaming)); + void f_shared_za() __attribute__((arm_shared_za)); + void f_new_za() __attribute__((arm_new_za)); + void f_preserves_za() __attribute__((arm_preserves_za)); + + +// CHECK: |-CXXMethodDecl {{.*}} test_lambda 'int (int)' implicit-inline +// CHECK: `-CompoundStmt +// CHECK-NEXT: |-DeclStmt +// CHECK-NEXT: | `-VarDecl +// CHECK-NEXT: | `-LambdaExpr +// CHECK-NEXT: | |-CXXRecordDecl +// CHECK: | | |-CXXMethodDecl {{.*}} used constexpr operator() 'int (int) __attribute__((arm_streaming)) const' inline +// CHECK: | | |-CXXConversionDecl {{.*}} implicit constexpr operator int (*)(int) __attribute__((arm_streaming)) 'int (*() const noexcept)(int) __attribute__((arm_streaming))' inline +// CHECK: | | |-CXXMethodDecl {{.*}} implicit __invoke 'int (int) __attribute__((arm_streaming))' static inline +// CHECK: `-ReturnStmt +// CHECK: `-CXXOperatorCallExpr +// CHECK-NEXT: |-ImplicitCastExpr {{.*}} 'int (*)(int) __attribute__((arm_streaming)) const' +// CHECK-NEXT: | `-DeclRefExpr {{.*}} 'int (int) __attribute__((arm_streaming)) const' lvalue CXXMethod {{.*}} 'operator()' 'int (int) __attribute__((arm_streaming)) const' + int test_lambda(int x) { + auto F = [](int x) __attribute__((arm_streaming)) { return x; }; + return F(x); + } + +// CHECK: |-TypedefDecl {{.*}} referenced s_ptrty 'void (*)(int, int) __attribute__((arm_streaming))' +// CHECK-NEXT: | `-PointerType {{.*}} 'void (*)(int, int) __attribute__((arm_streaming))' +// CHECK-NEXT: | `-ParenType {{.*}} 'void (int, int) __attribute__((arm_streaming))' sugar +// CHECK-NEXT: | `-FunctionProtoType {{.*}} 'void (int, int) __attribute__((arm_streaming))' cdecl + typedef void __attribute__((arm_streaming)) (*s_ptrty) (int, int); + +// CHECK: `-CXXMethodDecl {{.*}} test_streaming_ptrty 'void (s_ptrty, int, int)' implicit-inline +// CHECK-NEXT: |-ParmVarDecl {{.*}} used f 's_ptrty':'void (*)(int, int) __attribute__((arm_streaming))' +// CHECK-NEXT: |-ParmVarDecl {{.*}} 
used x 'int' +// CHECK-NEXT: |-ParmVarDecl {{.*}} used y 'int' +// CHECK: `-CompoundStmt +// CHECK-NEXT: `-ReturnStmt +// CHECK-NEXT: `-CallExpr +// CHECK-NEXT: |-ImplicitCastExpr {{.*}} 's_ptrty':'void (*)(int, int) __attribute__((arm_streaming))' +// CHECK-NEXT: | `-DeclRefExpr {{.*}} 's_ptrty':'void (*)(int, int) __attribute__((arm_streaming))' lvalue ParmVar {{.*}} 'f' 's_ptrty':'void (*)(int, int) __attribute__((arm_streaming))' +// CHECK-NEXT: |-ImplicitCastExpr {{.*}} 'int' +// CHECK-NEXT: | `-DeclRefExpr {{.*}} 'int' lvalue ParmVar {{.*}} 'x' 'int' +// CHECK-NEXT: `-ImplicitCastExpr {{.*}} 'int' +// CHECK-NEXT: `-DeclRefExpr {{.*}} 'int' lvalue ParmVar {{.*}} 'y' 'int' + void test_streaming_ptrty(s_ptrty f, int x, int y) { return f(x, y); }; +}; diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/aarch64-sme-attrs.cpp b/clang/test/CodeGen/aarch64-sme-intrinsics/aarch64-sme-attrs.cpp new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/aarch64-sme-attrs.cpp @@ -0,0 +1,274 @@ +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme \ +// RUN: -S -O0 -Werror -emit-llvm -o - %s | FileCheck %s + +extern "C" { + +extern int normal_callee(); + +// == FUNCTION DECLARATIONS == + +__attribute__((arm_streaming)) int streaming_decl(); +__attribute__((arm_streaming_compatible)) int streaming_compatible_decl(); +__attribute__((arm_shared_za)) int shared_za_decl(); +__attribute__((arm_preserves_za)) int preserves_za_decl(); +__attribute__((arm_new_za)) int new_za_decl(); + +// == FUNCTION DEFINITIONS == + +// CHECK-LABEL: @streaming_caller() +// CHECK-SAME: #[[SM_ENABLED:[0-9]+]] +// CHECK: call i32 @normal_callee() +// +__attribute__((arm_streaming)) int streaming_caller() { + return normal_callee(); +} + +// CHECK: declare i32 @normal_callee() #[[NORMAL_DECL:[0-9]+]] + + +// CHECK-LABEL: @streaming_callee() +// CHECK-SAME: #[[SM_ENABLED]] +// CHECK: call i32 @streaming_decl() #[[SM_ENABLED_CALL:[0-9]+]] +// +__attribute__((arm_streaming)) int streaming_callee() { + return streaming_decl(); +} + +// CHECK: declare i32 @streaming_decl() #[[SM_ENABLED_DECL:[0-9]+]] + +// CHECK-LABEL: @streaming_compatible_caller() +// CHECK-SAME: #[[SM_COMPATIBLE:[0-9]+]] +// CHECK: call i32 @normal_callee() +// +__attribute__((arm_streaming_compatible)) int streaming_compatible_caller() { + return normal_callee(); +} + +// CHECK-LABEL: @streaming_compatible_callee() +// CHECK-SAME: #[[SM_COMPATIBLE]] +// CHECK: call i32 @streaming_compatible_decl() #[[SM_COMPATIBLE_CALL:[0-9]+]] +// +__attribute__((arm_streaming_compatible)) int streaming_compatible_callee() { + return streaming_compatible_decl(); +} + +// CHECK: declare i32 @streaming_compatible_decl() #[[SM_COMPATIBLE_DECL:[0-9]+]] + +// CHECK-LABEL: @locally_streaming_caller() +// CHECK-SAME: #[[SM_BODY:[0-9]+]] +// CHECK: call i32 @normal_callee() +// +__attribute__((arm_locally_streaming)) int locally_streaming_caller() { + return normal_callee(); +} + +// CHECK-LABEL: @locally_streaming_callee() +// CHECK-SAME: #[[SM_BODY]] +// CHECK: call i32 @locally_streaming_caller() #[[SM_BODY_CALL:[0-9]+]] +// +__attribute__((arm_locally_streaming)) int locally_streaming_callee() { + return locally_streaming_caller(); +} + + +// CHECK-LABEL: @shared_za_caller() +// CHECK-SAME: #[[ZA_SHARED:[0-9]+]] +// CHECK: call i32 @normal_callee() +// +__attribute__((arm_shared_za)) int shared_za_caller() { + return normal_callee(); +} + +// CHECK-LABEL: @shared_za_callee() +// CHECK-SAME: #[[ZA_SHARED]] +// CHECK: call i32 
@shared_za_decl() #[[ZA_SHARED_CALL:[0-9]+]] +// +__attribute__((arm_shared_za)) int shared_za_callee() { + return shared_za_decl(); +} + +// CHECK: declare i32 @shared_za_decl() #[[ZA_SHARED_DECL:[0-9]+]] + + +// CHECK-LABEL: @preserves_za_caller() +// CHECK-SAME: #[[ZA_PRESERVED:[0-9]+]] +// CHECK: call i32 @normal_callee() +// +__attribute__((arm_preserves_za)) int preserves_za_caller() { + return normal_callee(); +} + +// CHECK-LABEL: @preserves_za_callee() +// CHECK-SAME: #[[ZA_PRESERVED]] +// CHECK: call i32 @preserves_za_decl() #[[ZA_PRESERVED_CALL:[0-9]+]] +// +__attribute__((arm_preserves_za)) int preserves_za_callee() { + return preserves_za_decl(); +} + +// CHECK: declare i32 @preserves_za_decl() #[[ZA_PRESERVED_DECL:[0-9]+]] + + +// CHECK-LABEL: @new_za_caller() +// CHECK-SAME: #[[ZA_NEW:[0-9]+]] +// CHECK: call i32 @normal_callee() +// +__attribute__((arm_new_za)) int new_za_caller() { + return normal_callee(); +} + +// CHECK-LABEL: @new_za_callee() +// CHECK-SAME: #[[ZA_NEW]] +// CHECK: call i32 @new_za_decl() #[[ZA_NEW_CALL:[0-9]+]] +// +__attribute__((arm_new_za)) int new_za_callee() { + return new_za_decl(); +} + +// CHECK: declare i32 @new_za_decl() #[[ZA_NEW_DECL:[0-9]+]] + + +// Ensure that the attributes are correctly propagated to function types +// and also to callsites. +typedef void __attribute__((arm_streaming)) (*s_ptrty) (int, int); +typedef void __attribute__((arm_streaming_compatible)) (*sc_ptrty) (int, int); +typedef void __attribute__((arm_shared_za)) (*sz_ptrty) (int, int); +typedef void __attribute__((arm_preserves_za)) (*pz_ptrty) (int, int); + +// CHECK-LABEL: @test_streaming_ptrty( +// CHECK-SAME: #[[NORMAL_DEF:[0-9]+]] +// CHECK: call void [[F:%.*]](i32 noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[SM_ENABLED_CALL]] +// +void test_streaming_ptrty(s_ptrty f, int x, int y) { return f(x, y); } +// CHECK-LABEL: @test_streaming_compatible_ptrty( +// CHECK-SAME: #[[NORMAL_DEF]] +// CHECK: call void [[F:%.*]](i32 noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[SM_COMPATIBLE_CALL]] +// +void test_streaming_compatible_ptrty(sc_ptrty f, int x, int y) { return f(x, y); } +// CHECK-LABEL: @test_shared_za( +// CHECK-SAME: #[[ZA_SHARED]] +// CHECK: call void [[F:%.*]](i32 noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ZA_SHARED_CALL]] +// +void __attribute__((arm_shared_za)) test_shared_za(sz_ptrty f, int x, int y) { return f(x, y); } +// CHECK-LABEL: @test_preserved_za( +// CHECK-SAME: #[[ZA_SHARED]] +// CHECK: call void [[F:%.*]](i32 noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ZA_PRESERVED_CALL]] +// +void __attribute__((arm_shared_za)) test_preserved_za(pz_ptrty f, int x, int y) { return f(x, y); } + +// CHECK-LABEL: @test_indirect_streaming_ptrty( +// CHECK-SAME: #[[NORMAL_DEF:[0-9]+]] +// CHECK: call void [[F:%.*]](i32 noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[SM_ENABLED_CALL]] +// +typedef s_ptrty **indirect_s_ptrty; +void test_indirect_streaming_ptrty(indirect_s_ptrty fptr, int x, int y) { return (**fptr)(x, y); } +} // extern "C" + +// +// Test that having the attribute in different places (on declaration and on type) +// both results in the attribute being applied to the type. 
+// + +// CHECK-LABEL: @_Z24test_same_type_streamingv( +// CHECK: call void @_Z10streaming1v() #[[SM_ENABLED_CALL]] +// CHECK: call void @_Z10streaming2v() #[[SM_ENABLED_CALL]] +// CHECK: call void @_Z10streaming3v() #[[SM_ENABLED_CALL]] +// CHECK: call void @_Z20same_type_streaming1v() #[[SM_ENABLED_CALL]] +// CHECK: call void @_Z20same_type_streaming2v() #[[SM_ENABLED_CALL]] +// CHECK: call void @_Z20same_type_streaming3v() #[[SM_ENABLED_CALL]] +// CHECK: ret void +// CHECK: } +// CHECK: declare void @_Z10streaming1v() #[[SM_ENABLED_DECL]] +// CHECK: declare void @_Z10streaming2v() #[[SM_ENABLED_DECL]] +// CHECK: declare void @_Z10streaming3v() #[[SM_ENABLED_DECL]] +// CHECK: declare void @_Z20same_type_streaming1v() #[[SM_ENABLED_DECL]] +// CHECK: declare void @_Z20same_type_streaming2v() #[[SM_ENABLED_DECL]] +// CHECK: declare void @_Z20same_type_streaming3v() #[[SM_ENABLED_DECL]] +__attribute__((arm_streaming)) void streaming1(); +void __attribute__((arm_streaming)) streaming2(); +void streaming3() __attribute__((arm_streaming)); +decltype(streaming1) same_type_streaming1; +decltype(streaming2) same_type_streaming2; +decltype(streaming3) same_type_streaming3; +void test_same_type_streaming() { + streaming1(); + streaming2(); + streaming3(); + same_type_streaming1(); + same_type_streaming2(); + same_type_streaming3(); +} + +// +// Test overloading; the attribute is not required for overloaded types and +// does not apply if not specified. +// + +// CHECK-LABEL: @_Z12overloadedfni( +// CHECK-SAME: #[[SM_ENABLED]] +int __attribute__((arm_streaming)) overloadedfn(int x) { return x; } +// CHECK-LABEL: @_Z12overloadedfnf( +// CHECK-SAME: #[[NORMAL_DEF]] +// +float overloadedfn(float x) { return x; } +// CHECK-LABEL: @_Z13test_overloadi( +// CHECK-SAME: #[[NORMAL_DEF]] +// +int test_overload(int x) { return overloadedfn(x); } +// CHECK-LABEL: @_Z13test_overloadf( +// CHECK-SAME: #[[NORMAL_DEF]] +// +float test_overload(float x) { return overloadedfn(x); } + +// CHECK-LABEL: @_Z11test_lambdai( +// CHECK-SAME: #[[NORMAL_DEF]] +// CHECK: call noundef i32 @"_ZZ11test_lambdaiENK3$_0clEi"({{.*}}) #[[SM_ENABLED_CALL]] +// +// CHECK: @"_ZZ11test_lambdaiENK3$_0clEi"( +// CHECK-SAME: #[[SM_ENABLED]] +int test_lambda(int x) { + auto F = [](int x) __attribute__((arm_streaming)) { return x; }; + return F(x); +} + +// CHECK-LABEL: @_Z27test_template_instantiationv( +// CHECK-SAME: #[[NORMAL_DEF]] +// CHECK: call noundef i32 @_Z15template_functyIiET_S0_(i32 noundef 12) #[[SM_ENABLED_CALL]] +// +// CHECK: @_Z15template_functyIiET_S0_( +// CHECK-SAME: #[[SM_ENABLED]] +template +Ty template_functy(Ty x) __attribute__((arm_streaming)) { return x; } +int test_template_instantiation() { return template_functy(12); } + +// +// Test that arm_locally_streaming is inherited by future redeclarations, +// even when they don't specify the attribute. 
+// + +// CHECK: define {{.*}} @_Z25locally_streaming_inheritv( +// CHECK-SAME: #[[SM_BODY]] +__attribute__((arm_locally_streaming)) void locally_streaming_inherit(); +void locally_streaming_inherit() { + streaming_decl(); +} + +// CHECK: attributes #[[SM_ENABLED]] = { mustprogress noinline nounwind optnone vscale_range(1,16) "aarch64_pstate_sm_enabled" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } +// CHECK: attributes #[[NORMAL_DECL]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } +// CHECK: attributes #[[SM_ENABLED_DECL]] = { "aarch64_pstate_sm_enabled" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } +// CHECK: attributes #[[SM_COMPATIBLE]] = { mustprogress noinline nounwind optnone vscale_range(1,16) "aarch64_pstate_sm_compatible" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } +// CHECK: attributes #[[SM_COMPATIBLE_DECL]] = { "aarch64_pstate_sm_compatible" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } +// CHECK: attributes #[[SM_BODY]] = { mustprogress noinline nounwind optnone vscale_range(1,16) "aarch64_pstate_sm_body" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } +// CHECK: attributes #[[ZA_SHARED]] = { mustprogress noinline nounwind optnone vscale_range(1,16) "aarch64_pstate_za_shared" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } +// CHECK: attributes #[[ZA_SHARED_DECL]] = { "aarch64_pstate_za_shared" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } +// CHECK: attributes #[[ZA_PRESERVED]] = { mustprogress noinline nounwind optnone vscale_range(1,16) "aarch64_pstate_za_preserved" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } +// CHECK: attributes #[[ZA_PRESERVED_DECL]] = { "aarch64_pstate_za_preserved" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } +// CHECK: attributes #[[ZA_NEW]] = { mustprogress noinline nounwind optnone vscale_range(1,16) "aarch64_pstate_za_new" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } +// CHECK: attributes #[[ZA_NEW_DECL]] = { "aarch64_pstate_za_new" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } +// CHECK: attributes #[[NORMAL_DEF]] = { mustprogress noinline nounwind optnone vscale_range(1,16) "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } +// CHECK: attributes #[[SM_ENABLED_CALL]] = { "aarch64_pstate_sm_enabled" } +// CHECK: attributes #[[SM_COMPATIBLE_CALL]] = { "aarch64_pstate_sm_compatible" } +// CHECK: attributes #[[SM_BODY_CALL]] = { "aarch64_pstate_sm_body" } +// CHECK: attributes #[[ZA_SHARED_CALL]] = { "aarch64_pstate_za_shared" } +// CHECK: attributes #[[ZA_PRESERVED_CALL]] = { "aarch64_pstate_za_preserved" } +// CHECK: attributes #[[ZA_NEW_CALL]] = { "aarch64_pstate_za_new" } diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add.c @@ -0,0 +1,188 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: 
%clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme-i16i64 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme-i16i64 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme-i16i64 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme-i16i64 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme-i16i64 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 +#endif + +// == HORIZONTAL == + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svaddha_za32_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PN:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PM:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.addha.nxv4i32(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_svaddha_za32_s32u10__SVBool_tu10__SVBool_tu11__SVInt32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PN:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PM:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.addha.nxv4i32(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svaddha_za32_s32(svbool_t pn, svbool_t pm, svint32_t zn) { + SVE_ACLE_FUNC(svaddha_za32,_s32,_m,)(3, pn, pm, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svaddha_za32_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PN:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PM:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.addha.nxv4i32(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_svaddha_za32_u32u10__SVBool_tu10__SVBool_tu12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PN:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PM:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.addha.nxv4i32(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svaddha_za32_u32(svbool_t pn, svbool_t pm, svuint32_t zn) { + SVE_ACLE_FUNC(svaddha_za32,_u32,_m,)(3, pn, pm, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svaddha_za64_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PN:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PM:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.addha.nxv2i64(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// 
CPP-CHECK-LABEL: @_Z21test_svaddha_za64_s64u10__SVBool_tu10__SVBool_tu11__SVInt64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PN:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PM:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.addha.nxv2i64(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svaddha_za64_s64(svbool_t pn, svbool_t pm, svint64_t zn) { + SVE_ACLE_FUNC(svaddha_za64,_s64,_m,)(3, pn, pm, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svaddha_za64_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PN:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PM:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.addha.nxv2i64(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_svaddha_za64_u64u10__SVBool_tu10__SVBool_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PN:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PM:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.addha.nxv2i64(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svaddha_za64_u64(svbool_t pn, svbool_t pm, svuint64_t zn) { + SVE_ACLE_FUNC(svaddha_za64,_u64,_m,)(3, pn, pm, zn); +} + +// == VERTICAL == + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svaddva_za32_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PN:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PM:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.addva.nxv4i32(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_svaddva_za32_s32u10__SVBool_tu10__SVBool_tu11__SVInt32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PN:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PM:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.addva.nxv4i32(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svaddva_za32_s32(svbool_t pn, svbool_t pm, svint32_t zn) { + SVE_ACLE_FUNC(svaddva_za32,_s32,_m,)(3, pn, pm, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svaddva_za32_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PN:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PM:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.addva.nxv4i32(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_svaddva_za32_u32u10__SVBool_tu10__SVBool_tu12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PN:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PM:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.addva.nxv4i32(i32 3, [[TMP0]], [[TMP1]], 
[[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svaddva_za32_u32(svbool_t pn, svbool_t pm, svuint32_t zn) { + SVE_ACLE_FUNC(svaddva_za32,_u32,_m,)(3, pn, pm, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svaddva_za64_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PN:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PM:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.addva.nxv2i64(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_svaddva_za64_s64u10__SVBool_tu10__SVBool_tu11__SVInt64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PN:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PM:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.addva.nxv2i64(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svaddva_za64_s64(svbool_t pn, svbool_t pm, svint64_t zn) { + SVE_ACLE_FUNC(svaddva_za64,_s64,_m,)(3, pn, pm, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svaddva_za64_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PN:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PM:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.addva.nxv2i64(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_svaddva_za64_u64u10__SVBool_tu10__SVBool_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PN:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PM:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.addva.nxv2i64(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svaddva_za64_u64(svbool_t pn, svbool_t pm, svuint64_t zn) { + SVE_ACLE_FUNC(svaddva_za64,_u64,_m,)(3, pn, pm, zn); +} diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_cnt.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_cnt.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_cnt.c @@ -0,0 +1,67 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + +#include + +// CHECK-LABEL: @test_svcntsb( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CHECK-NEXT: ret i64 [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z12test_svcntsbv( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CPP-CHECK-NEXT: ret i64 [[TMP0]] +// +__attribute__((arm_streaming_compatible)) +uint64_t test_svcntsb(void) { + return svcntsb(); +} + +// CHECK-LABEL: @test_svcntsh( +// CHECK-NEXT: entry: +// CHECK-NEXT: 
[[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsh() +// CHECK-NEXT: ret i64 [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z12test_svcntshv( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsh() +// CPP-CHECK-NEXT: ret i64 [[TMP0]] +// +__attribute__((arm_streaming_compatible)) +uint64_t test_svcntsh(void) { + return svcntsh(); +} + +// CHECK-LABEL: @test_svcntsw( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsw() +// CHECK-NEXT: ret i64 [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z12test_svcntswv( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsw() +// CPP-CHECK-NEXT: ret i64 [[TMP0]] +// +__attribute__((arm_streaming_compatible)) +uint64_t test_svcntsw(void) { + return svcntsw(); +} + +// CHECK-LABEL: @test_svcntsd( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsd() +// CHECK-NEXT: ret i64 [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z12test_svcntsdv( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsd() +// CPP-CHECK-NEXT: ret i64 [[TMP0]] +// +__attribute__((arm_streaming_compatible)) +uint64_t test_svcntsd(void) { + return svcntsd(); +} diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_loads.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_loads.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_loads.c @@ -0,0 +1,499 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + +#include + +// == HORIZONTAL == + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svld1_hor_za8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1b.horiz( [[PG:%.*]], ptr [[BASE:%.*]], i32 0, i32 [[TMP0]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z18test_svld1_hor_za8ju10__SVBool_tPKa( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1b.horiz( [[PG:%.*]], ptr [[BASE:%.*]], i32 0, i32 [[TMP0]]) +// CPP-CHECK-NEXT: ret void +// +void test_svld1_hor_za8(uint32_t slice_base, svbool_t pg, const int8_t *base) { + svld1_hor_za8(0, slice_base, 15, pg, base); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svld1_hor_za16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1h.horiz( [[TMP1]], ptr [[BASE:%.*]], i32 1, i32 [[TMP0]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z19test_svld1_hor_za16ju10__SVBool_tPKs( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void 
@llvm.aarch64.sme.ld1h.horiz( [[TMP1]], ptr [[BASE:%.*]], i32 1, i32 [[TMP0]]) +// CPP-CHECK-NEXT: ret void +// +void test_svld1_hor_za16(uint32_t slice_base, svbool_t pg, const int16_t *base) { + svld1_hor_za16(1, slice_base, 7, pg, base); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svld1_hor_za32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1w.horiz( [[TMP1]], ptr [[BASE:%.*]], i32 3, i32 [[TMP0]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z19test_svld1_hor_za32ju10__SVBool_tPKi( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1w.horiz( [[TMP1]], ptr [[BASE:%.*]], i32 3, i32 [[TMP0]]) +// CPP-CHECK-NEXT: ret void +// +void test_svld1_hor_za32(uint32_t slice_base, svbool_t pg, const int32_t *base) { + svld1_hor_za32(3, slice_base, 3, pg, base); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svld1_hor_za64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1d.horiz( [[TMP1]], ptr [[BASE:%.*]], i32 7, i32 [[TMP0]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z19test_svld1_hor_za64ju10__SVBool_tPKl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1d.horiz( [[TMP1]], ptr [[BASE:%.*]], i32 7, i32 [[TMP0]]) +// CPP-CHECK-NEXT: ret void +// +void test_svld1_hor_za64(uint32_t slice_base, svbool_t pg, const int64_t *base) { + svld1_hor_za64(7, slice_base, 1, pg, base); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svld1_hor_za128( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1q.horiz( [[TMP0]], ptr [[BASE:%.*]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svld1_hor_za128ju10__SVBool_tPKn( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1q.horiz( [[TMP0]], ptr [[BASE:%.*]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svld1_hor_za128(uint32_t slice_base, svbool_t pg, const __int128_t *base) { + svld1_hor_za128(15, slice_base, 0, pg, base); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svld1_hor_vnum_za8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 3 +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP2]] +// CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1b.horiz( [[PG:%.*]], ptr [[TMP3]], i32 0, i32 [[TMP0]]) +// CHECK-NEXT: ret void +// +// 
CPP-CHECK-LABEL: @_Z23test_svld1_hor_vnum_za8ju10__SVBool_tPKa( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CPP-CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP2]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1b.horiz( [[PG:%.*]], ptr [[TMP3]], i32 0, i32 [[TMP0]]) +// CPP-CHECK-NEXT: ret void +// +void test_svld1_hor_vnum_za8(uint32_t slice_base, svbool_t pg, const int8_t *base) { + svld1_hor_vnum_za8(0, slice_base, 15, pg, base, 3); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svld1_hor_vnum_za16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP3]] +// CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1h.horiz( [[TMP1]], ptr [[TMP4]], i32 1, i32 [[TMP0]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svld1_hor_vnum_za16ju10__SVBool_tPKs( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CPP-CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 3 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP3]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1h.horiz( [[TMP1]], ptr [[TMP4]], i32 1, i32 [[TMP0]]) +// CPP-CHECK-NEXT: ret void +// +void test_svld1_hor_vnum_za16(uint32_t slice_base, svbool_t pg, const int16_t *base) { + svld1_hor_vnum_za16(1, slice_base, 7, pg, base, 3); +} + +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svld1_hor_vnum_za32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP3]] +// CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1w.horiz( [[TMP1]], ptr [[TMP4]], i32 3, i32 [[TMP0]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svld1_hor_vnum_za32ju10__SVBool_tPKi( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CPP-CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 3 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP3]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1w.horiz( [[TMP1]], ptr [[TMP4]], i32 3, i32 [[TMP0]]) +// CPP-CHECK-NEXT: ret void +// +void test_svld1_hor_vnum_za32(uint32_t slice_base, svbool_t pg, const int32_t *base) { + svld1_hor_vnum_za32(3, slice_base, 3, pg, base, 3); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svld1_hor_vnum_za64( +// CHECK-NEXT: entry: 
+// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP3]] +// CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1d.horiz( [[TMP1]], ptr [[TMP4]], i32 7, i32 [[TMP0]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svld1_hor_vnum_za64ju10__SVBool_tPKl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CPP-CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 3 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP3]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1d.horiz( [[TMP1]], ptr [[TMP4]], i32 7, i32 [[TMP0]]) +// CPP-CHECK-NEXT: ret void +// +void test_svld1_hor_vnum_za64(uint32_t slice_base, svbool_t pg, const int64_t *base) { + svld1_hor_vnum_za64(7, slice_base, 1, pg, base, 3); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svld1_hor_vnum_za128( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 3 +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP2]] +// CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1q.horiz( [[TMP0]], ptr [[TMP3]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svld1_hor_vnum_za128ju10__SVBool_tPKn( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CPP-CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP2]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1q.horiz( [[TMP0]], ptr [[TMP3]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svld1_hor_vnum_za128(uint32_t slice_base, svbool_t pg, const __int128_t *base) { + svld1_hor_vnum_za128(15, slice_base, 0, pg, base, 3); +} + +// == VERTICAL == + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svld1_ver_za8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1b.vert( [[PG:%.*]], ptr [[BASE:%.*]], i32 0, i32 [[TMP0]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z18test_svld1_ver_za8ju10__SVBool_tPKa( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1b.vert( [[PG:%.*]], ptr [[BASE:%.*]], i32 0, i32 [[TMP0]]) +// CPP-CHECK-NEXT: ret void +// +void test_svld1_ver_za8(uint32_t slice_base, svbool_t pg, const int8_t *base) { + svld1_ver_za8(0, slice_base, 15, pg, base); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svld1_ver_za16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: [[TMP1:%.*]] = tail call 
@llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1h.vert( [[TMP1]], ptr [[BASE:%.*]], i32 1, i32 [[TMP0]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z19test_svld1_ver_za16ju10__SVBool_tPKs( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1h.vert( [[TMP1]], ptr [[BASE:%.*]], i32 1, i32 [[TMP0]]) +// CPP-CHECK-NEXT: ret void +// +void test_svld1_ver_za16(uint32_t slice_base, svbool_t pg, const int16_t *base) { + svld1_ver_za16(1, slice_base, 7, pg, base); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svld1_ver_za32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1w.vert( [[TMP1]], ptr [[BASE:%.*]], i32 3, i32 [[TMP0]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z19test_svld1_ver_za32ju10__SVBool_tPKi( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1w.vert( [[TMP1]], ptr [[BASE:%.*]], i32 3, i32 [[TMP0]]) +// CPP-CHECK-NEXT: ret void +// +void test_svld1_ver_za32(uint32_t slice_base, svbool_t pg, const int32_t *base) { + svld1_ver_za32(3, slice_base, 3, pg, base); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svld1_ver_za64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1d.vert( [[TMP1]], ptr [[BASE:%.*]], i32 7, i32 [[TMP0]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z19test_svld1_ver_za64ju10__SVBool_tPKl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1d.vert( [[TMP1]], ptr [[BASE:%.*]], i32 7, i32 [[TMP0]]) +// CPP-CHECK-NEXT: ret void +// +void test_svld1_ver_za64(uint32_t slice_base, svbool_t pg, const int64_t *base) { + svld1_ver_za64(7, slice_base, 1, pg, base); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svld1_ver_za128( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1q.vert( [[TMP0]], ptr [[BASE:%.*]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svld1_ver_za128ju10__SVBool_tPKn( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1q.vert( [[TMP0]], ptr [[BASE:%.*]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svld1_ver_za128(uint32_t slice_base, svbool_t pg, const __int128_t *base) { + svld1_ver_za128(15, slice_base, 0, pg, base); +} + +// +// +__attribute__((arm_streaming, 
arm_shared_za)) +// CHECK-LABEL: @test_svld1_ver_vnum_za8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 3 +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP2]] +// CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1b.vert( [[PG:%.*]], ptr [[TMP3]], i32 0, i32 [[TMP0]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_svld1_ver_vnum_za8ju10__SVBool_tPKa( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CPP-CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP2]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1b.vert( [[PG:%.*]], ptr [[TMP3]], i32 0, i32 [[TMP0]]) +// CPP-CHECK-NEXT: ret void +// +void test_svld1_ver_vnum_za8(uint32_t slice_base, svbool_t pg, const int8_t *base) { + svld1_ver_vnum_za8(0, slice_base, 15, pg, base, 3); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svld1_ver_vnum_za16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP3]] +// CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1h.vert( [[TMP1]], ptr [[TMP4]], i32 1, i32 [[TMP0]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svld1_ver_vnum_za16ju10__SVBool_tPKs( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CPP-CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 3 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP3]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1h.vert( [[TMP1]], ptr [[TMP4]], i32 1, i32 [[TMP0]]) +// CPP-CHECK-NEXT: ret void +// +void test_svld1_ver_vnum_za16(uint32_t slice_base, svbool_t pg, const int16_t *base) { + svld1_ver_vnum_za16(1, slice_base, 7, pg, base, 3); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svld1_ver_vnum_za32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP3]] +// CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1w.vert( [[TMP1]], ptr [[TMP4]], i32 3, i32 [[TMP0]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svld1_ver_vnum_za32ju10__SVBool_tPKi( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CPP-CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 3 +// 
CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP3]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1w.vert( [[TMP1]], ptr [[TMP4]], i32 3, i32 [[TMP0]]) +// CPP-CHECK-NEXT: ret void +// +void test_svld1_ver_vnum_za32(uint32_t slice_base, svbool_t pg, const int32_t *base) { + svld1_ver_vnum_za32(3, slice_base, 3, pg, base, 3); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svld1_ver_vnum_za64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP3]] +// CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1d.vert( [[TMP1]], ptr [[TMP4]], i32 7, i32 [[TMP0]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svld1_ver_vnum_za64ju10__SVBool_tPKl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CPP-CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 3 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP3]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1d.vert( [[TMP1]], ptr [[TMP4]], i32 7, i32 [[TMP0]]) +// CPP-CHECK-NEXT: ret void +// +void test_svld1_ver_vnum_za64(uint32_t slice_base, svbool_t pg, const int64_t *base) { + svld1_ver_vnum_za64(7, slice_base, 1, pg, base, 3); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svld1_ver_vnum_za128( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 3 +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP2]] +// CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1q.vert( [[TMP0]], ptr [[TMP3]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svld1_ver_vnum_za128ju10__SVBool_tPKn( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CPP-CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP2]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1q.vert( [[TMP0]], ptr [[TMP3]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svld1_ver_vnum_za128(uint32_t slice_base, svbool_t pg, const __int128_t *base) { + svld1_ver_vnum_za128(15, slice_base, 0, pg, base, 3); +} + +// +// +__attribute__((arm_streaming_compatible, arm_shared_za)) +// CHECK-LABEL: @test_svldr_vnum_za( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 15 +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP1]] +// CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[TMP3]], ptr [[TMP2]]) +// CHECK-NEXT: ret void +// +// 
CPP-CHECK-LABEL: @_Z18test_svldr_vnum_zajPKh( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CPP-CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 15 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP1]] +// CPP-CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[TMP3]], ptr [[TMP2]]) +// CPP-CHECK-NEXT: ret void +// +void test_svldr_vnum_za(uint32_t slice_base, const uint8_t *base) { + svldr_vnum_za(slice_base, 15, base); +} diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mem_ops.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mem_ops.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mem_ops.c @@ -0,0 +1,67 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + +#include + +// CHECK-LABEL: @test_arm_sc_memcpy( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[CALL:%.*]] = tail call ptr @__arm_sc_memcpy(ptr noundef [[DEST:%.*]], ptr noundef [[SRC:%.*]], i64 noundef [[N:%.*]]) #[[ATTR2:[0-9]+]] +// CHECK-NEXT: ret ptr [[CALL]] +// +// CPP-CHECK-LABEL: @_Z18test_arm_sc_memcpyPvPKvm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[CALL:%.*]] = tail call ptr @__arm_sc_memcpy(ptr noundef [[DEST:%.*]], ptr noundef [[SRC:%.*]], i64 noundef [[N:%.*]]) #[[ATTR2:[0-9]+]] +// CPP-CHECK-NEXT: ret ptr [[CALL]] +// +__attribute__((arm_streaming_compatible)) +void *test_arm_sc_memcpy(void *dest, const void *src, size_t n) { + return __arm_sc_memcpy(dest, src, n); +} + +// CHECK-LABEL: @test_arm_sc_memmove( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[CALL:%.*]] = tail call ptr @__arm_sc_memmove(ptr noundef [[DEST:%.*]], ptr noundef [[SRC:%.*]], i64 noundef [[N:%.*]]) #[[ATTR2]] +// CHECK-NEXT: ret ptr [[CALL]] +// +// CPP-CHECK-LABEL: @_Z19test_arm_sc_memmovePvPKvm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[CALL:%.*]] = tail call ptr @__arm_sc_memmove(ptr noundef [[DEST:%.*]], ptr noundef [[SRC:%.*]], i64 noundef [[N:%.*]]) #[[ATTR2]] +// CPP-CHECK-NEXT: ret ptr [[CALL]] +// +__attribute__((arm_streaming_compatible)) +void *test_arm_sc_memmove(void *dest, const void *src, size_t n) { + return __arm_sc_memmove(dest, src, n); +} + +// CHECK-LABEL: @test_arm_sc_memset( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[CALL:%.*]] = tail call ptr @__arm_sc_memset(ptr noundef [[DEST:%.*]], i32 noundef [[C:%.*]], i64 noundef [[N:%.*]]) #[[ATTR2]] +// CHECK-NEXT: ret ptr [[CALL]] +// +// CPP-CHECK-LABEL: @_Z18test_arm_sc_memsetPvim( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[CALL:%.*]] = tail call ptr @__arm_sc_memset(ptr noundef [[DEST:%.*]], i32 noundef [[C:%.*]], i64 noundef [[N:%.*]]) #[[ATTR2]] +// CPP-CHECK-NEXT: ret ptr [[CALL]] +// +__attribute__((arm_streaming_compatible)) +void *test_arm_sc_memset(void *dest, int c, size_t n) { + return __arm_sc_memset(dest, c, n); +} + +// CHECK-LABEL: @test_arm_sc_memchr( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[CALL:%.*]] = tail call ptr @__arm_sc_memchr(ptr noundef [[DEST:%.*]], i32 noundef 
[[C:%.*]], i64 noundef [[N:%.*]]) #[[ATTR2]] +// CHECK-NEXT: ret ptr [[CALL]] +// +// CPP-CHECK-LABEL: @_Z18test_arm_sc_memchrPvim( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[CALL:%.*]] = tail call ptr @__arm_sc_memchr(ptr noundef [[DEST:%.*]], i32 noundef [[C:%.*]], i64 noundef [[N:%.*]]) #[[ATTR2]] +// CPP-CHECK-NEXT: ret ptr [[CALL]] +// +__attribute__((arm_streaming_compatible)) +void *test_arm_sc_memchr(void *dest, int c, size_t n) { + return __arm_sc_memchr(dest, c, n); +} diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mop.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mop.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mop.c @@ -0,0 +1,420 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -S -O1 -emit-llvm -triple aarch64-none-linux-gnu -target-feature +sme-i16i64 \ +// RUN: -target-feature +sme-f64f64 -Werror -o - %s | FileCheck %s +// RUN: %clang_cc1 -S -O1 -emit-llvm -triple aarch64-none-linux-gnu -target-feature +sme-i16i64 \ +// RUN: -target-feature +sme-f64f64 -Werror -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -S -O1 -emit-llvm -triple aarch64-none-linux-gnu -target-feature +sme-i16i64 \ +// RUN: -DSVE_OVERLOADED_FORMS -target-feature +sme-f64f64 -Werror -o - %s | FileCheck %s +// RUN: %clang_cc1 -S -O1 -emit-llvm -triple aarch64-none-linux-gnu -target-feature +sme-i16i64 \ +// RUN: -DSVE_OVERLOADED_FORMS -target-feature +sme-f64f64 -Werror -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme-i16i64 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. 
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 +#endif + +// == MOPA / MOPS == + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmopa_za32_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PN:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PM:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.mopa.wide.nxv8bf16(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_svmopa_za32_bf16u10__SVBool_tu10__SVBool_tu14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PN:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PM:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mopa.wide.nxv8bf16(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmopa_za32_bf16(svbool_t pn, svbool_t pm, svbfloat16_t zn, svbfloat16_t zm) { + SVE_ACLE_FUNC(svmopa_za32,_bf16,_m,)(3, pn, pm, zn, zm); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmopa_za32_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PN:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PM:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.mopa.wide.nxv8f16(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmopa_za32_f16u10__SVBool_tu10__SVBool_tu13__SVFloat16_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PN:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PM:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mopa.wide.nxv8f16(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmopa_za32_f16(svbool_t pn, svbool_t pm, svfloat16_t zn, svfloat16_t zm) { + SVE_ACLE_FUNC(svmopa_za32,_f16,_m,)(3, pn, pm, zn, zm); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmopa_za32_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smopa.wide.nxv16i8(i32 3, [[PN:%.*]], [[PM:%.*]], [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z19test_svmopa_za32_s8u10__SVBool_tu10__SVBool_tu10__SVInt8_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smopa.wide.nxv16i8(i32 3, [[PN:%.*]], [[PM:%.*]], [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmopa_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svint8_t zm) { + SVE_ACLE_FUNC(svmopa_za32,_s8,_m,)(3, pn, pm, zn, zm); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmopa_za32_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umopa.wide.nxv16i8(i32 3, [[PN:%.*]], [[PM:%.*]], [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z19test_svmopa_za32_u8u10__SVBool_tu10__SVBool_tu11__SVUint8_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umopa.wide.nxv16i8(i32 3, 
[[PN:%.*]], [[PM:%.*]], [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmopa_za32_u8(svbool_t pn, svbool_t pm, svuint8_t zn, svuint8_t zm) { + SVE_ACLE_FUNC(svmopa_za32,_u8,_m,)(3, pn, pm, zn, zm); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmopa_za64_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PN:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PM:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smopa.wide.nxv8i16(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmopa_za64_s16u10__SVBool_tu10__SVBool_tu11__SVInt16_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PN:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PM:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smopa.wide.nxv8i16(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmopa_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svint16_t zm) { + SVE_ACLE_FUNC(svmopa_za64,_s16,_m,)(3, pn, pm, zn, zm); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmopa_za64_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PN:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PM:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umopa.wide.nxv8i16(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmopa_za64_u16u10__SVBool_tu10__SVBool_tu12__SVUint16_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PN:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PM:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umopa.wide.nxv8i16(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmopa_za64_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svuint16_t zm) { + SVE_ACLE_FUNC(svmopa_za64,_u16,_m,)(3, pn, pm, zn, zm); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmops_za32_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PN:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PM:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.mops.wide.nxv8bf16(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_svmops_za32_bf16u10__SVBool_tu10__SVBool_tu14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PN:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PM:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mops.wide.nxv8bf16(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmops_za32_bf16(svbool_t pn, svbool_t pm, svbfloat16_t zn, svbfloat16_t zm) { + SVE_ACLE_FUNC(svmops_za32,_bf16,_m,)(3, pn, pm, zn, zm); +} + +// 
+// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmops_za32_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PN:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PM:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.mops.wide.nxv8f16(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmops_za32_f16u10__SVBool_tu10__SVBool_tu13__SVFloat16_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PN:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PM:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mops.wide.nxv8f16(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmops_za32_f16(svbool_t pn, svbool_t pm, svfloat16_t zn, svfloat16_t zm) { + SVE_ACLE_FUNC(svmops_za32,_f16,_m,)(3, pn, pm, zn, zm); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmops_za32_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smops.wide.nxv16i8(i32 3, [[PN:%.*]], [[PM:%.*]], [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z19test_svmops_za32_s8u10__SVBool_tu10__SVBool_tu10__SVInt8_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smops.wide.nxv16i8(i32 3, [[PN:%.*]], [[PM:%.*]], [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmops_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svint8_t zm) { + SVE_ACLE_FUNC(svmops_za32,_s8,_m,)(3, pn, pm, zn, zm); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmops_za32_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umops.wide.nxv16i8(i32 3, [[PN:%.*]], [[PM:%.*]], [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z19test_svmops_za32_u8u10__SVBool_tu10__SVBool_tu11__SVUint8_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umops.wide.nxv16i8(i32 3, [[PN:%.*]], [[PM:%.*]], [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmops_za32_u8(svbool_t pn, svbool_t pm, svuint8_t zn, svuint8_t zm) { + SVE_ACLE_FUNC(svmops_za32,_u8,_m,)(3, pn, pm, zn, zm); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmops_za64_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PN:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PM:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smops.wide.nxv8i16(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmops_za64_s16u10__SVBool_tu10__SVBool_tu11__SVInt16_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PN:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PM:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smops.wide.nxv8i16(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmops_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svint16_t zm) { + 
SVE_ACLE_FUNC(svmops_za64,_s16,_m,)(3, pn, pm, zn, zm); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmops_za64_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PN:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PM:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umops.wide.nxv8i16(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmops_za64_u16u10__SVBool_tu10__SVBool_tu12__SVUint16_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PN:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PM:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umops.wide.nxv8i16(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmops_za64_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svuint16_t zm) { + SVE_ACLE_FUNC(svmops_za64,_u16,_m,)(3, pn, pm, zn, zm); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmopa_za32_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PN:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PM:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.mopa.nxv4f32(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmopa_za32_f32u10__SVBool_tu10__SVBool_tu13__SVFloat32_tu13__SVFloat32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PN:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PM:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mopa.nxv4f32(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmopa_za32_f32(svbool_t pn, svbool_t pm, svfloat32_t zn, svfloat32_t zm) { + SVE_ACLE_FUNC(svmopa_za32,_f32,_m,)(3, pn, pm, zn, zm); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmopa_za64_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PN:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PM:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.mopa.nxv2f64(i32 7, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmopa_za64_f64u10__SVBool_tu10__SVBool_tu13__SVFloat64_tu13__SVFloat64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PN:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PM:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mopa.nxv2f64(i32 7, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmopa_za64_f64(svbool_t pn, svbool_t pm, svfloat64_t zn, svfloat64_t zm) { + SVE_ACLE_FUNC(svmopa_za64,_f64,_m,)(7, pn, pm, zn, zm); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmops_za32_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call 
@llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PN:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PM:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.mops.nxv4f32(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmops_za32_f32u10__SVBool_tu10__SVBool_tu13__SVFloat32_tu13__SVFloat32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PN:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PM:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mops.nxv4f32(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmops_za32_f32(svbool_t pn, svbool_t pm, svfloat32_t zn, svfloat32_t zm) { + SVE_ACLE_FUNC(svmops_za32,_f32,_m,)(3, pn, pm, zn, zm); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmops_za64_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PN:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PM:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.mops.nxv2f64(i32 7, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmops_za64_f64u10__SVBool_tu10__SVBool_tu13__SVFloat64_tu13__SVFloat64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PN:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PM:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mops.nxv2f64(i32 7, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmops_za64_f64(svbool_t pn, svbool_t pm, svfloat64_t zn, svfloat64_t zm) { + SVE_ACLE_FUNC(svmops_za64,_f64,_m,)(7, pn, pm, zn, zm); +} + +// == MIXED SIGN MOPA / MOPS == + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svsumops_za32_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sumops.wide.nxv16i8(i32 3, [[PN:%.*]], [[PM:%.*]], [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_svsumops_za32_s8u10__SVBool_tu10__SVBool_tu10__SVInt8_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sumops.wide.nxv16i8(i32 3, [[PN:%.*]], [[PM:%.*]], [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svsumops_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svuint8_t zm) { + SVE_ACLE_FUNC(svsumops_za32,_s8,_m,)(3, pn, pm, zn, zm); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svsumops_za64_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PN:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PM:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sumops.wide.nxv8i16(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svsumops_za64_s16u10__SVBool_tu10__SVBool_tu11__SVInt16_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PN:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call 
@llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PM:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sumops.wide.nxv8i16(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svsumops_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svuint16_t zm) { + SVE_ACLE_FUNC(svsumops_za64,_s16,_m,)(3, pn, pm, zn, zm); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svusmops_za32_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmops.wide.nxv16i8(i32 3, [[PN:%.*]], [[PM:%.*]], [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_svusmops_za32_u8u10__SVBool_tu10__SVBool_tu11__SVUint8_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmops.wide.nxv16i8(i32 3, [[PN:%.*]], [[PM:%.*]], [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svusmops_za32_u8(svbool_t pn, svbool_t pm, svuint8_t zn, svint8_t zm) { + SVE_ACLE_FUNC(svusmops_za32,_u8,_m,)(3, pn, pm, zn, zm); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svusmops_za64_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PN:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PM:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmops.wide.nxv8i16(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svusmops_za64_u16u10__SVBool_tu10__SVBool_tu12__SVUint16_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PN:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PM:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmops.wide.nxv8i16(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svusmops_za64_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svint16_t zm) { + SVE_ACLE_FUNC(svusmops_za64,_u16,_m,)(3, pn, pm, zn, zm); +} diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_reads.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_reads.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_reads.c @@ -0,0 +1,964 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. 
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 +#endif + +// == HORIZONTAL == + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za8_s8_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 0, i32 [[TMP0]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z24test_svread_hor_za8_s8_mu10__SVInt8_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 0, i32 [[TMP0]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svint8_t test_svread_hor_za8_s8_m(svint8_t zd, svbool_t pg, uint32_t slice_base) { + return SVE_ACLE_FUNC(svread_hor_za8,_s8,_m,)(zd, pg, 0, slice_base, 15); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za8_u8_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 0, i32 [[TMP0]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z24test_svread_hor_za8_u8_mu11__SVUint8_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 0, i32 [[TMP0]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svuint8_t test_svread_hor_za8_u8_m(svuint8_t zd, svbool_t pg, uint32_t slice_base) { + return SVE_ACLE_FUNC(svread_hor_za8,_u8,_m,)(zd, pg, 0, slice_base, 15); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za16_s16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 1, i32 [[TMP1]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z26test_svread_hor_za16_s16_mu11__SVInt16_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 1, i32 [[TMP1]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svint16_t test_svread_hor_za16_s16_m(svint16_t zd, svbool_t pg, uint32_t slice_base) { + return SVE_ACLE_FUNC(svread_hor_za16,_s16,_m,)(zd, pg, 1, slice_base, 7); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za16_u16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 1, i32 [[TMP1]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z26test_svread_hor_za16_u16_mu12__SVUint16_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: 
[[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 1, i32 [[TMP1]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svuint16_t test_svread_hor_za16_u16_m(svuint16_t zd, svbool_t pg, uint32_t slice_base) { + return SVE_ACLE_FUNC(svread_hor_za16,_u16,_m,)(zd, pg, 1, slice_base, 7); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za16_f16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv8f16( [[ZD:%.*]], [[TMP0]], i32 1, i32 [[TMP1]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z26test_svread_hor_za16_f16_mu13__SVFloat16_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv8f16( [[ZD:%.*]], [[TMP0]], i32 1, i32 [[TMP1]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svfloat16_t test_svread_hor_za16_f16_m(svfloat16_t zd, svbool_t pg, uint32_t slice_base) { + return SVE_ACLE_FUNC(svread_hor_za16,_f16,_m,)(zd, pg, 1, slice_base, 7); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za16_bf16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv8bf16( [[ZD:%.*]], [[TMP0]], i32 1, i32 [[TMP1]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z27test_svread_hor_za16_bf16_mu14__SVBFloat16_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv8bf16( [[ZD:%.*]], [[TMP0]], i32 1, i32 [[TMP1]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svbfloat16_t test_svread_hor_za16_bf16_m(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) { + return SVE_ACLE_FUNC(svread_hor_za16,_bf16,_m,)(zd, pg, 1, slice_base, 7); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za32_s32_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 3, i32 [[TMP1]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z26test_svread_hor_za32_s32_mu11__SVInt32_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 3, i32 [[TMP1]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svint32_t test_svread_hor_za32_s32_m(svint32_t zd, svbool_t pg, uint32_t slice_base) { + return SVE_ACLE_FUNC(svread_hor_za32,_s32,_m,)(zd, pg, 3, slice_base, 3); +} + +// +// 
+__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za32_u32_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 3, i32 [[TMP1]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z26test_svread_hor_za32_u32_mu12__SVUint32_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 3, i32 [[TMP1]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svuint32_t test_svread_hor_za32_u32_m(svuint32_t zd, svbool_t pg, uint32_t slice_base) { + return SVE_ACLE_FUNC(svread_hor_za32,_u32,_m,)(zd, pg, 3, slice_base, 3); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za32_f32_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv4f32( [[ZD:%.*]], [[TMP0]], i32 3, i32 [[TMP1]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z26test_svread_hor_za32_f32_mu13__SVFloat32_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv4f32( [[ZD:%.*]], [[TMP0]], i32 3, i32 [[TMP1]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svfloat32_t test_svread_hor_za32_f32_m(svfloat32_t zd, svbool_t pg, uint32_t slice_base) { + return SVE_ACLE_FUNC(svread_hor_za32,_f32,_m,)(zd, pg, 3, slice_base, 3); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za64_s64_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 7, i32 [[TMP1]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z26test_svread_hor_za64_s64_mu11__SVInt64_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 7, i32 [[TMP1]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svint64_t test_svread_hor_za64_s64_m(svint64_t zd, svbool_t pg, uint32_t slice_base) { + return SVE_ACLE_FUNC(svread_hor_za64,_s64,_m,)(zd, pg, 7, slice_base, 1); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za64_u64_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 7, i32 [[TMP1]]) +// 
CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z26test_svread_hor_za64_u64_mu12__SVUint64_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 7, i32 [[TMP1]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svuint64_t test_svread_hor_za64_u64_m(svuint64_t zd, svbool_t pg, uint32_t slice_base) { + return SVE_ACLE_FUNC(svread_hor_za64,_u64,_m,)(zd, pg, 7, slice_base, 1); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za64_f64_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv2f64( [[ZD:%.*]], [[TMP0]], i32 7, i32 [[TMP1]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z26test_svread_hor_za64_f64_mu13__SVFloat64_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv2f64( [[ZD:%.*]], [[TMP0]], i32 7, i32 [[TMP1]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svfloat64_t test_svread_hor_za64_f64_m(svfloat64_t zd, svbool_t pg, uint32_t slice_base) { + return SVE_ACLE_FUNC(svread_hor_za64,_f64,_m,)(zd, pg, 7, slice_base, 1); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za128_s8_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z26test_svread_hor_za128_s8_mu10__SVInt8_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svint8_t test_svread_hor_za128_s8_m(svint8_t zd, svbool_t pg, uint32_t slice_base) { + return SVE_ACLE_FUNC(svread_hor_za128,_s8,_m,)(zd, pg, 15, slice_base, 0); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za128_u8_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z26test_svread_hor_za128_u8_mu11__SVUint8_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svuint8_t test_svread_hor_za128_u8_m(svuint8_t zd, svbool_t pg, uint32_t slice_base) { + return SVE_ACLE_FUNC(svread_hor_za128,_u8,_m,)(zd, pg, 15, slice_base, 0); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za128_s16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// 
CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z27test_svread_hor_za128_s16_mu11__SVInt16_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svint16_t test_svread_hor_za128_s16_m(svint16_t zd, svbool_t pg, uint32_t slice_base) { + return SVE_ACLE_FUNC(svread_hor_za128,_s16,_m,)(zd, pg, 15, slice_base, 0); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za128_u16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z27test_svread_hor_za128_u16_mu12__SVUint16_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svuint16_t test_svread_hor_za128_u16_m(svuint16_t zd, svbool_t pg, uint32_t slice_base) { + return SVE_ACLE_FUNC(svread_hor_za128,_u16,_m,)(zd, pg, 15, slice_base, 0); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za128_f16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv8f16( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z27test_svread_hor_za128_f16_mu13__SVFloat16_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv8f16( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svfloat16_t test_svread_hor_za128_f16_m(svfloat16_t zd, svbool_t pg, uint32_t slice_base) { + return SVE_ACLE_FUNC(svread_hor_za128,_f16,_m,)(zd, pg, 15, slice_base, 0); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za128_bf16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv8bf16( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_hor_za128_bf16_mu14__SVBFloat16_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv8bf16( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svbfloat16_t test_svread_hor_za128_bf16_m(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) { + return SVE_ACLE_FUNC(svread_hor_za128,_bf16,_m,)(zd, pg, 15, slice_base, 0); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: 
@test_svread_hor_za128_s32_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z27test_svread_hor_za128_s32_mu11__SVInt32_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svint32_t test_svread_hor_za128_s32_m(svint32_t zd, svbool_t pg, uint32_t slice_base) { + return SVE_ACLE_FUNC(svread_hor_za128,_s32,_m,)(zd, pg, 15, slice_base, 0); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za128_u32_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z27test_svread_hor_za128_u32_mu12__SVUint32_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svuint32_t test_svread_hor_za128_u32_m(svuint32_t zd, svbool_t pg, uint32_t slice_base) { + return SVE_ACLE_FUNC(svread_hor_za128,_u32,_m,)(zd, pg, 15, slice_base, 0); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za128_f32_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv4f32( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z27test_svread_hor_za128_f32_mu13__SVFloat32_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv4f32( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svfloat32_t test_svread_hor_za128_f32_m(svfloat32_t zd, svbool_t pg, uint32_t slice_base) { + return SVE_ACLE_FUNC(svread_hor_za128,_f32,_m,)(zd, pg, 15, slice_base, 0); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za128_s64_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z27test_svread_hor_za128_s64_mu11__SVInt64_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: 
ret [[TMP1]] +// +svint64_t test_svread_hor_za128_s64_m(svint64_t zd, svbool_t pg, uint32_t slice_base) { + return SVE_ACLE_FUNC(svread_hor_za128,_s64,_m,)(zd, pg, 15, slice_base, 0); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za128_u64_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z27test_svread_hor_za128_u64_mu12__SVUint64_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svuint64_t test_svread_hor_za128_u64_m(svuint64_t zd, svbool_t pg, uint32_t slice_base) { + return SVE_ACLE_FUNC(svread_hor_za128,_u64,_m,)(zd, pg, 15, slice_base, 0); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za128_f64_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv2f64( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z27test_svread_hor_za128_f64_mu13__SVFloat64_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv2f64( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svfloat64_t test_svread_hor_za128_f64_m(svfloat64_t zd, svbool_t pg, uint32_t slice_base) { + return SVE_ACLE_FUNC(svread_hor_za128,_f64,_m,)(zd, pg, 15, slice_base, 0); +} + +// == VERTICAL == + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za8_s8_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 0, i32 [[TMP0]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z24test_svread_ver_za8_s8_mu10__SVInt8_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 0, i32 [[TMP0]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svint8_t test_svread_ver_za8_s8_m(svint8_t zd, svbool_t pg, uint32_t slice_base) { + return SVE_ACLE_FUNC(svread_ver_za8,_s8,_m,)(zd, pg, 0, slice_base, 15); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za8_u8_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 0, i32 [[TMP0]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z24test_svread_ver_za8_u8_mu11__SVUint8_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call 
@llvm.aarch64.sme.read.vert.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 0, i32 [[TMP0]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svuint8_t test_svread_ver_za8_u8_m(svuint8_t zd, svbool_t pg, uint32_t slice_base) { + return SVE_ACLE_FUNC(svread_ver_za8,_u8,_m,)(zd, pg, 0, slice_base, 15); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za16_s16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 1, i32 [[TMP1]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z26test_svread_ver_za16_s16_mu11__SVInt16_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 1, i32 [[TMP1]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svint16_t test_svread_ver_za16_s16_m(svint16_t zd, svbool_t pg, uint32_t slice_base) { + return SVE_ACLE_FUNC(svread_ver_za16,_s16,_m,)(zd, pg, 1, slice_base, 7); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za16_u16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 1, i32 [[TMP1]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z26test_svread_ver_za16_u16_mu12__SVUint16_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 1, i32 [[TMP1]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svuint16_t test_svread_ver_za16_u16_m(svuint16_t zd, svbool_t pg, uint32_t slice_base) { + return SVE_ACLE_FUNC(svread_ver_za16,_u16,_m,)(zd, pg, 1, slice_base, 7); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za16_f16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv8f16( [[ZD:%.*]], [[TMP0]], i32 1, i32 [[TMP1]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z26test_svread_ver_za16_f16_mu13__SVFloat16_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv8f16( [[ZD:%.*]], [[TMP0]], i32 1, i32 [[TMP1]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svfloat16_t test_svread_ver_za16_f16_m(svfloat16_t zd, svbool_t pg, uint32_t slice_base) { + return SVE_ACLE_FUNC(svread_ver_za16,_f16,_m,)(zd, pg, 1, slice_base, 7); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za16_bf16_m( +// CHECK-NEXT: 
entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv8bf16( [[ZD:%.*]], [[TMP0]], i32 1, i32 [[TMP1]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z27test_svread_ver_za16_bf16_mu14__SVBFloat16_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv8bf16( [[ZD:%.*]], [[TMP0]], i32 1, i32 [[TMP1]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svbfloat16_t test_svread_ver_za16_bf16_m(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) { + return SVE_ACLE_FUNC(svread_ver_za16,_bf16,_m,)(zd, pg, 1, slice_base, 7); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za32_s32_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 3, i32 [[TMP1]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z26test_svread_ver_za32_s32_mu11__SVInt32_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 3, i32 [[TMP1]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svint32_t test_svread_ver_za32_s32_m(svint32_t zd, svbool_t pg, uint32_t slice_base) { + return SVE_ACLE_FUNC(svread_ver_za32,_s32,_m,)(zd, pg, 3, slice_base, 3); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za32_u32_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 3, i32 [[TMP1]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z26test_svread_ver_za32_u32_mu12__SVUint32_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 3, i32 [[TMP1]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svuint32_t test_svread_ver_za32_u32_m(svuint32_t zd, svbool_t pg, uint32_t slice_base) { + return SVE_ACLE_FUNC(svread_ver_za32,_u32,_m,)(zd, pg, 3, slice_base, 3); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za32_f32_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv4f32( [[ZD:%.*]], [[TMP0]], i32 3, i32 [[TMP1]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: 
@_Z26test_svread_ver_za32_f32_mu13__SVFloat32_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv4f32( [[ZD:%.*]], [[TMP0]], i32 3, i32 [[TMP1]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svfloat32_t test_svread_ver_za32_f32_m(svfloat32_t zd, svbool_t pg, uint32_t slice_base) { + return SVE_ACLE_FUNC(svread_ver_za32,_f32,_m,)(zd, pg, 3, slice_base, 3); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za64_s64_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 7, i32 [[TMP1]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z26test_svread_ver_za64_s64_mu11__SVInt64_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 7, i32 [[TMP1]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svint64_t test_svread_ver_za64_s64_m(svint64_t zd, svbool_t pg, uint32_t slice_base) { + return SVE_ACLE_FUNC(svread_ver_za64,_s64,_m,)(zd, pg, 7, slice_base, 1); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za64_u64_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 7, i32 [[TMP1]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z26test_svread_ver_za64_u64_mu12__SVUint64_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 7, i32 [[TMP1]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svuint64_t test_svread_ver_za64_u64_m(svuint64_t zd, svbool_t pg, uint32_t slice_base) { + return SVE_ACLE_FUNC(svread_ver_za64,_u64,_m,)(zd, pg, 7, slice_base, 1); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za64_f64_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv2f64( [[ZD:%.*]], [[TMP0]], i32 7, i32 [[TMP1]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z26test_svread_ver_za64_f64_mu13__SVFloat64_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv2f64( [[ZD:%.*]], [[TMP0]], i32 7, i32 [[TMP1]]) +// CPP-CHECK-NEXT: ret 
[[TMP2]] +// +svfloat64_t test_svread_ver_za64_f64_m(svfloat64_t zd, svbool_t pg, uint32_t slice_base) { + return SVE_ACLE_FUNC(svread_ver_za64,_f64,_m,)(zd, pg, 7, slice_base, 1); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za128_s8_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z26test_svread_ver_za128_s8_mu10__SVInt8_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svint8_t test_svread_ver_za128_s8_m(svint8_t zd, svbool_t pg, uint32_t slice_base) { + return SVE_ACLE_FUNC(svread_ver_za128,_s8,_m,)(zd, pg, 15, slice_base, 0); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za128_u8_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z26test_svread_ver_za128_u8_mu11__SVUint8_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svuint8_t test_svread_ver_za128_u8_m(svuint8_t zd, svbool_t pg, uint32_t slice_base) { + return SVE_ACLE_FUNC(svread_ver_za128,_u8,_m,)(zd, pg, 15, slice_base, 0); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za128_s16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z27test_svread_ver_za128_s16_mu11__SVInt16_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svint16_t test_svread_ver_za128_s16_m(svint16_t zd, svbool_t pg, uint32_t slice_base) { + return SVE_ACLE_FUNC(svread_ver_za128,_s16,_m,)(zd, pg, 15, slice_base, 0); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za128_u16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z27test_svread_ver_za128_u16_mu12__SVUint16_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svuint16_t test_svread_ver_za128_u16_m(svuint16_t zd, svbool_t pg, uint32_t slice_base) { + return SVE_ACLE_FUNC(svread_ver_za128,_u16,_m,)(zd, pg, 
15, slice_base, 0); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za128_f16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv8f16( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z27test_svread_ver_za128_f16_mu13__SVFloat16_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv8f16( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svfloat16_t test_svread_ver_za128_f16_m(svfloat16_t zd, svbool_t pg, uint32_t slice_base) { + return SVE_ACLE_FUNC(svread_ver_za128,_f16,_m,)(zd, pg, 15, slice_base, 0); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za128_bf16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv8bf16( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_ver_za128_bf16_mu14__SVBFloat16_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv8bf16( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svbfloat16_t test_svread_ver_za128_bf16_m(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) { + return SVE_ACLE_FUNC(svread_ver_za128,_bf16,_m,)(zd, pg, 15, slice_base, 0); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za128_s32_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z27test_svread_ver_za128_s32_mu11__SVInt32_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svint32_t test_svread_ver_za128_s32_m(svint32_t zd, svbool_t pg, uint32_t slice_base) { + return SVE_ACLE_FUNC(svread_ver_za128,_s32,_m,)(zd, pg, 15, slice_base, 0); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za128_u32_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z27test_svread_ver_za128_u32_mu12__SVUint32_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call 
@llvm.aarch64.sme.readq.vert.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svuint32_t test_svread_ver_za128_u32_m(svuint32_t zd, svbool_t pg, uint32_t slice_base) { + return SVE_ACLE_FUNC(svread_ver_za128,_u32,_m,)(zd, pg, 15, slice_base, 0); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za128_f32_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv4f32( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z27test_svread_ver_za128_f32_mu13__SVFloat32_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv4f32( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svfloat32_t test_svread_ver_za128_f32_m(svfloat32_t zd, svbool_t pg, uint32_t slice_base) { + return SVE_ACLE_FUNC(svread_ver_za128,_f32,_m,)(zd, pg, 15, slice_base, 0); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za128_s64_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z27test_svread_ver_za128_s64_mu11__SVInt64_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svint64_t test_svread_ver_za128_s64_m(svint64_t zd, svbool_t pg, uint32_t slice_base) { + return SVE_ACLE_FUNC(svread_ver_za128,_s64,_m,)(zd, pg, 15, slice_base, 0); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za128_u64_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z27test_svread_ver_za128_u64_mu12__SVUint64_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svuint64_t test_svread_ver_za128_u64_m(svuint64_t zd, svbool_t pg, uint32_t slice_base) { + return SVE_ACLE_FUNC(svread_ver_za128,_u64,_m,)(zd, pg, 15, slice_base, 0); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za128_f64_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv2f64( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// 
CPP-CHECK-LABEL: @_Z27test_svread_ver_za128_f64_mu13__SVFloat64_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv2f64( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svfloat64_t test_svread_ver_za128_f64_m(svfloat64_t zd, svbool_t pg, uint32_t slice_base) { + return SVE_ACLE_FUNC(svread_ver_za128,_f64,_m,)(zd, pg, 15, slice_base, 0); +} diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_state_funs.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_state_funs.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_state_funs.c @@ -0,0 +1,98 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + +#include + +// CHECK-LABEL: @test_in_streaming_mode_from_normal( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call aarch64_sme_preservemost_from_x2 { i64, i64 } @__arm_sme_state() #[[ATTR5:[0-9]+]] +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { i64, i64 } [[TMP0]], 0 +// CHECK-NEXT: [[AND_I:%.*]] = and i64 [[TMP1]], 1 +// CHECK-NEXT: [[TOBOOL_I:%.*]] = icmp ne i64 [[AND_I]], 0 +// CHECK-NEXT: ret i1 [[TOBOOL_I]] +// +// CPP-CHECK-LABEL: @_Z34test_in_streaming_mode_from_normalv( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call aarch64_sme_preservemost_from_x2 { i64, i64 } @__arm_sme_state() #[[ATTR5:[0-9]+]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { i64, i64 } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[AND_I:%.*]] = and i64 [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TOBOOL_I:%.*]] = icmp ne i64 [[AND_I]], 0 +// CPP-CHECK-NEXT: ret i1 [[TOBOOL_I]] +// +bool test_in_streaming_mode_from_normal(void) { + return __arm_in_streaming_mode(); +} + +// +// +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_in_streaming_mode_from_streaming( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call aarch64_sme_preservemost_from_x2 { i64, i64 } @__arm_sme_state() #[[ATTR5]] +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { i64, i64 } [[TMP0]], 0 +// CHECK-NEXT: [[AND_I:%.*]] = and i64 [[TMP1]], 1 +// CHECK-NEXT: [[TOBOOL_I:%.*]] = icmp ne i64 [[AND_I]], 0 +// CHECK-NEXT: ret i1 [[TOBOOL_I]] +// +// CPP-CHECK-LABEL: @_Z37test_in_streaming_mode_from_streamingv( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call aarch64_sme_preservemost_from_x2 { i64, i64 } @__arm_sme_state() #[[ATTR5]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { i64, i64 } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[AND_I:%.*]] = and i64 [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TOBOOL_I:%.*]] = icmp ne i64 [[AND_I]], 0 +// CPP-CHECK-NEXT: ret i1 [[TOBOOL_I]] +// +bool test_in_streaming_mode_from_streaming(void) { + return __arm_in_streaming_mode(); +} + +// CHECK-LABEL: @test_za_disable( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @__arm_za_disable() #[[ATTR6:[0-9]+]] +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_za_disablev( +// CPP-CHECK-NEXT: entry: +// 
CPP-CHECK-NEXT: tail call void @__arm_za_disable() #[[ATTR6:[0-9]+]] +// CPP-CHECK-NEXT: ret void +// +void test_za_disable(void) { + __arm_za_disable(); +} + +// CHECK-LABEL: @test_has_sme( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call aarch64_sme_preservemost_from_x2 { i64, i64 } @__arm_sme_state() #[[ATTR5]] +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { i64, i64 } [[TMP0]], 0 +// CHECK-NEXT: [[TOBOOL_I:%.*]] = icmp slt i64 [[TMP1]], 0 +// CHECK-NEXT: ret i1 [[TOBOOL_I]] +// +// CPP-CHECK-LABEL: @_Z12test_has_smev( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call aarch64_sme_preservemost_from_x2 { i64, i64 } @__arm_sme_state() #[[ATTR5]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { i64, i64 } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TOBOOL_I:%.*]] = icmp slt i64 [[TMP1]], 0 +// CPP-CHECK-NEXT: ret i1 [[TOBOOL_I]] +// +bool test_has_sme(void) { + return __arm_has_sme(); +} + +// +// +__attribute__((arm_streaming_compatible, arm_shared_za)) +// CHECK-LABEL: @test_svundef_za( +// CHECK-NEXT: entry: +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svundef_zav( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: ret void +// +void test_svundef_za(void) { + svundef_za(); +} + diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_stores.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_stores.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_stores.c @@ -0,0 +1,500 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + +#include + +// == HORIZONTAL == + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svst1_hor_za8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.st1b.horiz( [[PG:%.*]], ptr [[BASE:%.*]], i32 0, i32 [[TMP0]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z18test_svst1_hor_za8ju10__SVBool_tPa( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1b.horiz( [[PG:%.*]], ptr [[BASE:%.*]], i32 0, i32 [[TMP0]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_hor_za8(uint32_t slice_base, svbool_t pg, int8_t *base) { + svst1_hor_za8(0, slice_base, 15, pg, base); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svst1_hor_za16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.st1h.horiz( [[TMP1]], ptr [[BASE:%.*]], i32 1, i32 [[TMP0]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z19test_svst1_hor_za16ju10__SVBool_tPs( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1h.horiz( [[TMP1]], ptr 
[[BASE:%.*]], i32 1, i32 [[TMP0]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_hor_za16(uint32_t slice_base, svbool_t pg, int16_t *base) { + svst1_hor_za16(1, slice_base, 7, pg, base); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svst1_hor_za32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.st1w.horiz( [[TMP1]], ptr [[BASE:%.*]], i32 3, i32 [[TMP0]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z19test_svst1_hor_za32ju10__SVBool_tPi( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1w.horiz( [[TMP1]], ptr [[BASE:%.*]], i32 3, i32 [[TMP0]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_hor_za32(uint32_t slice_base, svbool_t pg, int32_t *base) { + svst1_hor_za32(3, slice_base, 3, pg, base); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svst1_hor_za64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.st1d.horiz( [[TMP1]], ptr [[BASE:%.*]], i32 7, i32 [[TMP0]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z19test_svst1_hor_za64ju10__SVBool_tPl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1d.horiz( [[TMP1]], ptr [[BASE:%.*]], i32 7, i32 [[TMP0]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_hor_za64(uint32_t slice_base, svbool_t pg, int64_t *base) { + svst1_hor_za64(7, slice_base, 1, pg, base); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svst1_hor_za128( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.st1q.horiz( [[TMP0]], ptr [[BASE:%.*]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svst1_hor_za128ju10__SVBool_tPn( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1q.horiz( [[TMP0]], ptr [[BASE:%.*]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_hor_za128(uint32_t slice_base, svbool_t pg, __int128_t *base) { + svst1_hor_za128(15, slice_base, 0, pg, base); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svst1_hor_vnum_za8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 3 +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP2]] +// CHECK-NEXT: tail call void @llvm.aarch64.sme.st1b.horiz( [[PG:%.*]], ptr [[TMP3]], i32 0, i32 [[TMP0]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_svst1_hor_vnum_za8ju10__SVBool_tPa( +// 
CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CPP-CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP2]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1b.horiz( [[PG:%.*]], ptr [[TMP3]], i32 0, i32 [[TMP0]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_hor_vnum_za8(uint32_t slice_base, svbool_t pg, int8_t *base) { + svst1_hor_vnum_za8(0, slice_base, 15, pg, base, 3); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svst1_hor_vnum_za16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP3]] +// CHECK-NEXT: tail call void @llvm.aarch64.sme.st1h.horiz( [[TMP1]], ptr [[TMP4]], i32 1, i32 [[TMP0]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svst1_hor_vnum_za16ju10__SVBool_tPs( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CPP-CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 3 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP3]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1h.horiz( [[TMP1]], ptr [[TMP4]], i32 1, i32 [[TMP0]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_hor_vnum_za16(uint32_t slice_base, svbool_t pg, int16_t *base) { + svst1_hor_vnum_za16(1, slice_base, 7, pg, base, 3); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svst1_hor_vnum_za32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP3]] +// CHECK-NEXT: tail call void @llvm.aarch64.sme.st1w.horiz( [[TMP1]], ptr [[TMP4]], i32 3, i32 [[TMP0]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svst1_hor_vnum_za32ju10__SVBool_tPi( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CPP-CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 3 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP3]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1w.horiz( [[TMP1]], ptr [[TMP4]], i32 3, i32 [[TMP0]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_hor_vnum_za32(uint32_t slice_base, svbool_t pg, int32_t *base) { + svst1_hor_vnum_za32(3, slice_base, 3, pg, base, 3); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svst1_hor_vnum_za64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CHECK-NEXT: 
[[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP3]] +// CHECK-NEXT: tail call void @llvm.aarch64.sme.st1d.horiz( [[TMP1]], ptr [[TMP4]], i32 7, i32 [[TMP0]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svst1_hor_vnum_za64ju10__SVBool_tPl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CPP-CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 3 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP3]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1d.horiz( [[TMP1]], ptr [[TMP4]], i32 7, i32 [[TMP0]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_hor_vnum_za64(uint32_t slice_base, svbool_t pg, int64_t *base) { + svst1_hor_vnum_za64(7, slice_base, 1, pg, base, 3); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svst1_hor_vnum_za128( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 3 +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP2]] +// CHECK-NEXT: tail call void @llvm.aarch64.sme.st1q.horiz( [[TMP0]], ptr [[TMP3]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svst1_hor_vnum_za128ju10__SVBool_tPn( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CPP-CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP2]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1q.horiz( [[TMP0]], ptr [[TMP3]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_hor_vnum_za128(uint32_t slice_base, svbool_t pg, __int128_t *base) { + svst1_hor_vnum_za128(15, slice_base, 0, pg, base, 3); +} + +// == VERTICAL == + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svst1_ver_za8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.st1b.vert( [[PG:%.*]], ptr [[BASE:%.*]], i32 0, i32 [[TMP0]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z18test_svst1_ver_za8ju10__SVBool_tPa( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1b.vert( [[PG:%.*]], ptr [[BASE:%.*]], i32 0, i32 [[TMP0]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_ver_za8(uint32_t slice_base, svbool_t pg, int8_t *base) { + svst1_ver_za8(0, slice_base, 15, pg, base); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svst1_ver_za16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void 
@llvm.aarch64.sme.st1h.vert( [[TMP1]], ptr [[BASE:%.*]], i32 1, i32 [[TMP0]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z19test_svst1_ver_za16ju10__SVBool_tPs( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1h.vert( [[TMP1]], ptr [[BASE:%.*]], i32 1, i32 [[TMP0]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_ver_za16(uint32_t slice_base, svbool_t pg, int16_t *base) { + svst1_ver_za16(1, slice_base, 7, pg, base); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svst1_ver_za32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.st1w.vert( [[TMP1]], ptr [[BASE:%.*]], i32 3, i32 [[TMP0]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z19test_svst1_ver_za32ju10__SVBool_tPi( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1w.vert( [[TMP1]], ptr [[BASE:%.*]], i32 3, i32 [[TMP0]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_ver_za32(uint32_t slice_base, svbool_t pg, int32_t *base) { + svst1_ver_za32(3, slice_base, 3, pg, base); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svst1_ver_za64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.st1d.vert( [[TMP1]], ptr [[BASE:%.*]], i32 7, i32 [[TMP0]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z19test_svst1_ver_za64ju10__SVBool_tPl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1d.vert( [[TMP1]], ptr [[BASE:%.*]], i32 7, i32 [[TMP0]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_ver_za64(uint32_t slice_base, svbool_t pg, int64_t *base) { + svst1_ver_za64(7, slice_base, 1, pg, base); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svst1_ver_za128( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.st1q.vert( [[TMP0]], ptr [[BASE:%.*]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svst1_ver_za128ju10__SVBool_tPn( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1q.vert( [[TMP0]], ptr [[BASE:%.*]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_ver_za128(uint32_t slice_base, svbool_t pg, __int128_t *base) { + svst1_ver_za128(15, slice_base, 0, pg, base); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svst1_ver_vnum_za8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 
[[SLICE_BASE:%.*]], 15 +// CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 3 +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP2]] +// CHECK-NEXT: tail call void @llvm.aarch64.sme.st1b.vert( [[PG:%.*]], ptr [[TMP3]], i32 0, i32 [[TMP0]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_svst1_ver_vnum_za8ju10__SVBool_tPa( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CPP-CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP2]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1b.vert( [[PG:%.*]], ptr [[TMP3]], i32 0, i32 [[TMP0]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_ver_vnum_za8(uint32_t slice_base, svbool_t pg, int8_t *base) { + svst1_ver_vnum_za8(0, slice_base, 15, pg, base, 3); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svst1_ver_vnum_za16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP3]] +// CHECK-NEXT: tail call void @llvm.aarch64.sme.st1h.vert( [[TMP1]], ptr [[TMP4]], i32 1, i32 [[TMP0]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svst1_ver_vnum_za16ju10__SVBool_tPs( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CPP-CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 3 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP3]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1h.vert( [[TMP1]], ptr [[TMP4]], i32 1, i32 [[TMP0]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_ver_vnum_za16(uint32_t slice_base, svbool_t pg, int16_t *base) { + svst1_ver_vnum_za16(1, slice_base, 7, pg, base, 3); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svst1_ver_vnum_za32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP3]] +// CHECK-NEXT: tail call void @llvm.aarch64.sme.st1w.vert( [[TMP1]], ptr [[TMP4]], i32 3, i32 [[TMP0]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svst1_ver_vnum_za32ju10__SVBool_tPi( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CPP-CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 3 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP3]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1w.vert( 
[[TMP1]], ptr [[TMP4]], i32 3, i32 [[TMP0]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_ver_vnum_za32(uint32_t slice_base, svbool_t pg, int32_t *base) { + svst1_ver_vnum_za32(3, slice_base, 3, pg, base, 3); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svst1_ver_vnum_za64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP3]] +// CHECK-NEXT: tail call void @llvm.aarch64.sme.st1d.vert( [[TMP1]], ptr [[TMP4]], i32 7, i32 [[TMP0]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svst1_ver_vnum_za64ju10__SVBool_tPl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CPP-CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 3 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP3]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1d.vert( [[TMP1]], ptr [[TMP4]], i32 7, i32 [[TMP0]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_ver_vnum_za64(uint32_t slice_base, svbool_t pg, int64_t *base) { + svst1_ver_vnum_za64(7, slice_base, 1, pg, base, 3); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svst1_ver_vnum_za128( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 3 +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP2]] +// CHECK-NEXT: tail call void @llvm.aarch64.sme.st1q.vert( [[TMP0]], ptr [[TMP3]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svst1_ver_vnum_za128ju10__SVBool_tPn( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CPP-CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP2]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1q.vert( [[TMP0]], ptr [[TMP3]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_ver_vnum_za128(uint32_t slice_base, svbool_t pg, __int128_t *base) { + svst1_ver_vnum_za128(15, slice_base, 0, pg, base, 3); +} + +// +// +__attribute__((arm_streaming_compatible, arm_shared_za)) +// CHECK-LABEL: @test_svstr_vnum_za( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 15 +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP1]] +// CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.str(i32 [[TMP3]], ptr [[TMP2]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z18test_svstr_vnum_zajPh( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CPP-CHECK-NEXT: 
[[TMP1:%.*]] = mul i64 [[TMP0]], 15 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP1]] +// CPP-CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.str(i32 [[TMP3]], ptr [[TMP2]]) +// CPP-CHECK-NEXT: ret void +// +void test_svstr_vnum_za(uint32_t slice_base, uint8_t *base) { + svstr_vnum_za(slice_base, 15, base); +} diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_writes.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_writes.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_writes.c @@ -0,0 +1,965 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 +#endif + +// == HORIZONTAL == + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za8_s8_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv16i8(i32 0, i32 [[TMP0]], [[PG:%.*]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svwrite_hor_za8_s8_mju10__SVBool_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv16i8(i32 0, i32 [[TMP0]], [[PG:%.*]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za8_s8_m(uint32_t slice_base, svbool_t pg, svint8_t zn) { + SVE_ACLE_FUNC(svwrite_hor_za8,_s8,_m,)(0, slice_base, 15, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za8_u8_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv16i8(i32 0, i32 [[TMP0]], [[PG:%.*]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svwrite_hor_za8_u8_mju10__SVBool_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv16i8(i32 0, i32 [[TMP0]], [[PG:%.*]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za8_u8_m(uint32_t slice_base, svbool_t pg, svuint8_t zn) { + SVE_ACLE_FUNC(svwrite_hor_za8,_u8,_m,)(0, slice_base, 15, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za16_s16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call 
@llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv8i16(i32 1, i32 [[TMP1]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_hor_za16_s16_mju10__SVBool_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv8i16(i32 1, i32 [[TMP1]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za16_s16_m(uint32_t slice_base, svbool_t pg, svint16_t zn) { + SVE_ACLE_FUNC(svwrite_hor_za16,_s16,_m,)(1, slice_base, 7, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za16_u16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv8i16(i32 1, i32 [[TMP1]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_hor_za16_u16_mju10__SVBool_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv8i16(i32 1, i32 [[TMP1]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za16_u16_m(uint32_t slice_base, svbool_t pg, svuint16_t zn) { + SVE_ACLE_FUNC(svwrite_hor_za16,_u16,_m,)(1, slice_base, 7, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za16_f16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv8f16(i32 1, i32 [[TMP1]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_hor_za16_f16_mju10__SVBool_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv8f16(i32 1, i32 [[TMP1]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za16_f16_m(uint32_t slice_base, svbool_t pg, svfloat16_t zn) { + SVE_ACLE_FUNC(svwrite_hor_za16,_f16,_m,)(1, slice_base, 7, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za16_bf16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv8bf16(i32 1, i32 [[TMP1]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z28test_svwrite_hor_za16_bf16_mju10__SVBool_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] 
= add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv8bf16(i32 1, i32 [[TMP1]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za16_bf16_m(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) { + SVE_ACLE_FUNC(svwrite_hor_za16,_bf16,_m,)(1, slice_base, 7, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za32_s32_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv4i32(i32 3, i32 [[TMP1]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_hor_za32_s32_mju10__SVBool_tu11__SVInt32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv4i32(i32 3, i32 [[TMP1]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za32_s32_m(uint32_t slice_base, svbool_t pg, svint32_t zn) { + SVE_ACLE_FUNC(svwrite_hor_za32,_s32,_m,)(3, slice_base, 3, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za32_u32_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv4i32(i32 3, i32 [[TMP1]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_hor_za32_u32_mju10__SVBool_tu12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv4i32(i32 3, i32 [[TMP1]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za32_u32_m(uint32_t slice_base, svbool_t pg, svuint32_t zn) { + SVE_ACLE_FUNC(svwrite_hor_za32,_u32,_m,)(3, slice_base, 3, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za32_f32_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv4f32(i32 3, i32 [[TMP1]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_hor_za32_f32_mju10__SVBool_tu13__SVFloat32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv4f32(i32 3, i32 [[TMP1]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za32_f32_m(uint32_t slice_base, svbool_t pg, svfloat32_t zn) { + SVE_ACLE_FUNC(svwrite_hor_za32,_f32,_m,)(3, slice_base, 3, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za64_s64_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call 
@llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv2i64(i32 7, i32 [[TMP1]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_hor_za64_s64_mju10__SVBool_tu11__SVInt64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv2i64(i32 7, i32 [[TMP1]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za64_s64_m(uint32_t slice_base, svbool_t pg, svint64_t zn) { + SVE_ACLE_FUNC(svwrite_hor_za64,_s64,_m,)(7, slice_base, 1, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za64_u64_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv2i64(i32 7, i32 [[TMP1]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_hor_za64_u64_mju10__SVBool_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv2i64(i32 7, i32 [[TMP1]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za64_u64_m(uint32_t slice_base, svbool_t pg, svuint64_t zn) { + SVE_ACLE_FUNC(svwrite_hor_za64,_u64,_m,)(7, slice_base, 1, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za64_f64_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv2f64(i32 7, i32 [[TMP1]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_hor_za64_f64_mju10__SVBool_tu13__SVFloat64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv2f64(i32 7, i32 [[TMP1]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za64_f64_m(uint32_t slice_base, svbool_t pg, svfloat64_t zn) { + SVE_ACLE_FUNC(svwrite_hor_za64,_f64,_m,)(7, slice_base, 1, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za128_s8_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv16i8(i32 15, i32 [[SLICE_BASE:%.*]], [[PG:%.*]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_hor_za128_s8_mju10__SVBool_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv16i8(i32 15, i32 [[SLICE_BASE:%.*]], [[PG:%.*]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za128_s8_m(uint32_t slice_base, svbool_t pg, svint8_t zn) { + 
SVE_ACLE_FUNC(svwrite_hor_za128,_s8,_m,)(15, slice_base, 0, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za128_u8_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv16i8(i32 15, i32 [[SLICE_BASE:%.*]], [[PG:%.*]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_hor_za128_u8_mju10__SVBool_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv16i8(i32 15, i32 [[SLICE_BASE:%.*]], [[PG:%.*]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za128_u8_m(uint32_t slice_base, svbool_t pg, svuint8_t zn) { + SVE_ACLE_FUNC(svwrite_hor_za128,_u8,_m,)(15, slice_base, 0, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za128_s16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv8i16(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z28test_svwrite_hor_za128_s16_mju10__SVBool_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv8i16(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za128_s16_m(uint32_t slice_base, svbool_t pg, svint16_t zn) { + SVE_ACLE_FUNC(svwrite_hor_za128,_s16,_m,)(15, slice_base, 0, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za128_u16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv8i16(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z28test_svwrite_hor_za128_u16_mju10__SVBool_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv8i16(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za128_u16_m(uint32_t slice_base, svbool_t pg, svuint16_t zn) { + SVE_ACLE_FUNC(svwrite_hor_za128,_u16,_m,)(15, slice_base, 0, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za128_f16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv8f16(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z28test_svwrite_hor_za128_f16_mju10__SVBool_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv8f16(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za128_f16_m(uint32_t slice_base, svbool_t pg, svfloat16_t zn) { + SVE_ACLE_FUNC(svwrite_hor_za128,_f16,_m,)(15, slice_base, 0, pg, zn); +} + +// +// +// 
+__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za128_bf16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv8bf16(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_hor_za128_bf16_mju10__SVBool_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv8bf16(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za128_bf16_m(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) { + SVE_ACLE_FUNC(svwrite_hor_za128,_bf16,_m,)(15, slice_base, 0, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za128_s32_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv4i32(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z28test_svwrite_hor_za128_s32_mju10__SVBool_tu11__SVInt32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv4i32(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za128_s32_m(uint32_t slice_base, svbool_t pg, svint32_t zn) { + SVE_ACLE_FUNC(svwrite_hor_za128,_s32,_m,)(15, slice_base, 0, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za128_u32_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv4i32(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z28test_svwrite_hor_za128_u32_mju10__SVBool_tu12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv4i32(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za128_u32_m(uint32_t slice_base, svbool_t pg, svuint32_t zn) { + SVE_ACLE_FUNC(svwrite_hor_za128,_u32,_m,)(15, slice_base, 0, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za128_f32_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv4f32(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z28test_svwrite_hor_za128_f32_mju10__SVBool_tu13__SVFloat32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv4f32(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za128_f32_m(uint32_t 
slice_base, svbool_t pg, svfloat32_t zn) { + SVE_ACLE_FUNC(svwrite_hor_za128,_f32,_m,)(15, slice_base, 0, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za128_s64_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv2i64(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z28test_svwrite_hor_za128_s64_mju10__SVBool_tu11__SVInt64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv2i64(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za128_s64_m(uint32_t slice_base, svbool_t pg, svint64_t zn) { + SVE_ACLE_FUNC(svwrite_hor_za128,_s64,_m,)(15, slice_base, 0, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za128_u64_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv2i64(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z28test_svwrite_hor_za128_u64_mju10__SVBool_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv2i64(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za128_u64_m(uint32_t slice_base, svbool_t pg, svuint64_t zn) { + SVE_ACLE_FUNC(svwrite_hor_za128,_u64,_m,)(15, slice_base, 0, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za128_f64_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv2f64(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z28test_svwrite_hor_za128_f64_mju10__SVBool_tu13__SVFloat64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv2f64(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za128_f64_m(uint32_t slice_base, svbool_t pg, svfloat64_t zn) { + SVE_ACLE_FUNC(svwrite_hor_za128,_f64,_m,)(15, slice_base, 0, pg, zn); +} + +// == VERTICAL == + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za8_s8_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv16i8(i32 0, i32 [[TMP0]], [[PG:%.*]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svwrite_ver_za8_s8_mju10__SVBool_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv16i8(i32 0, i32 [[TMP0]], [[PG:%.*]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void 
test_svwrite_ver_za8_s8_m(uint32_t slice_base, svbool_t pg, svint8_t zn) { + SVE_ACLE_FUNC(svwrite_ver_za8,_s8,_m,)(0, slice_base, 15, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za8_u8_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv16i8(i32 0, i32 [[TMP0]], [[PG:%.*]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svwrite_ver_za8_u8_mju10__SVBool_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv16i8(i32 0, i32 [[TMP0]], [[PG:%.*]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za8_u8_m(uint32_t slice_base, svbool_t pg, svuint8_t zn) { + SVE_ACLE_FUNC(svwrite_ver_za8,_u8,_m,)(0, slice_base, 15, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za16_s16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv8i16(i32 1, i32 [[TMP1]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_ver_za16_s16_mju10__SVBool_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv8i16(i32 1, i32 [[TMP1]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za16_s16_m(uint32_t slice_base, svbool_t pg, svint16_t zn) { + SVE_ACLE_FUNC(svwrite_ver_za16,_s16,_m,)(1, slice_base, 7, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za16_u16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv8i16(i32 1, i32 [[TMP1]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_ver_za16_u16_mju10__SVBool_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv8i16(i32 1, i32 [[TMP1]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za16_u16_m(uint32_t slice_base, svbool_t pg, svuint16_t zn) { + SVE_ACLE_FUNC(svwrite_ver_za16,_u16,_m,)(1, slice_base, 7, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za16_f16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv8f16(i32 1, i32 [[TMP1]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_ver_za16_f16_mju10__SVBool_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail 
call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv8f16(i32 1, i32 [[TMP1]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za16_f16_m(uint32_t slice_base, svbool_t pg, svfloat16_t zn) { + SVE_ACLE_FUNC(svwrite_ver_za16,_f16,_m,)(1, slice_base, 7, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za16_bf16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv8bf16(i32 1, i32 [[TMP1]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z28test_svwrite_ver_za16_bf16_mju10__SVBool_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv8bf16(i32 1, i32 [[TMP1]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za16_bf16_m(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) { + SVE_ACLE_FUNC(svwrite_ver_za16,_bf16,_m,)(1, slice_base, 7, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za32_s32_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv4i32(i32 3, i32 [[TMP1]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_ver_za32_s32_mju10__SVBool_tu11__SVInt32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv4i32(i32 3, i32 [[TMP1]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za32_s32_m(uint32_t slice_base, svbool_t pg, svint32_t zn) { + SVE_ACLE_FUNC(svwrite_ver_za32,_s32,_m,)(3, slice_base, 3, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za32_u32_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv4i32(i32 3, i32 [[TMP1]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_ver_za32_u32_mju10__SVBool_tu12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv4i32(i32 3, i32 [[TMP1]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za32_u32_m(uint32_t slice_base, svbool_t pg, svuint32_t zn) { + SVE_ACLE_FUNC(svwrite_ver_za32,_u32,_m,)(3, slice_base, 3, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: 
@test_svwrite_ver_za32_f32_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv4f32(i32 3, i32 [[TMP1]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_ver_za32_f32_mju10__SVBool_tu13__SVFloat32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv4f32(i32 3, i32 [[TMP1]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za32_f32_m(uint32_t slice_base, svbool_t pg, svfloat32_t zn) { + SVE_ACLE_FUNC(svwrite_ver_za32,_f32,_m,)(3, slice_base, 3, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za64_s64_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv2i64(i32 7, i32 [[TMP1]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_ver_za64_s64_mju10__SVBool_tu11__SVInt64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv2i64(i32 7, i32 [[TMP1]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za64_s64_m(uint32_t slice_base, svbool_t pg, svint64_t zn) { + SVE_ACLE_FUNC(svwrite_ver_za64,_s64,_m,)(7, slice_base, 1, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za64_u64_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv2i64(i32 7, i32 [[TMP1]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_ver_za64_u64_mju10__SVBool_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv2i64(i32 7, i32 [[TMP1]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za64_u64_m(uint32_t slice_base, svbool_t pg, svuint64_t zn) { + SVE_ACLE_FUNC(svwrite_ver_za64,_u64,_m,)(7, slice_base, 1, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za64_f64_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv2f64(i32 7, i32 [[TMP1]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_ver_za64_f64_mju10__SVBool_tu13__SVFloat64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call 
@llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv2f64(i32 7, i32 [[TMP1]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za64_f64_m(uint32_t slice_base, svbool_t pg, svfloat64_t zn) { + SVE_ACLE_FUNC(svwrite_ver_za64,_f64,_m,)(7, slice_base, 1, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za128_s8_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv16i8(i32 15, i32 [[SLICE_BASE:%.*]], [[PG:%.*]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_ver_za128_s8_mju10__SVBool_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv16i8(i32 15, i32 [[SLICE_BASE:%.*]], [[PG:%.*]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za128_s8_m(uint32_t slice_base, svbool_t pg, svint8_t zn) { + SVE_ACLE_FUNC(svwrite_ver_za128,_s8,_m,)(15, slice_base, 0, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za128_u8_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv16i8(i32 15, i32 [[SLICE_BASE:%.*]], [[PG:%.*]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_ver_za128_u8_mju10__SVBool_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv16i8(i32 15, i32 [[SLICE_BASE:%.*]], [[PG:%.*]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za128_u8_m(uint32_t slice_base, svbool_t pg, svuint8_t zn) { + SVE_ACLE_FUNC(svwrite_ver_za128,_u8,_m,)(15, slice_base, 0, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za128_s16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv8i16(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z28test_svwrite_ver_za128_s16_mju10__SVBool_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv8i16(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za128_s16_m(uint32_t slice_base, svbool_t pg, svint16_t zn) { + SVE_ACLE_FUNC(svwrite_ver_za128,_s16,_m,)(15, slice_base, 0, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za128_u16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv8i16(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z28test_svwrite_ver_za128_u16_mju10__SVBool_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv8i16(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void 
test_svwrite_ver_za128_u16_m(uint32_t slice_base, svbool_t pg, svuint16_t zn) { + SVE_ACLE_FUNC(svwrite_ver_za128,_u16,_m,)(15, slice_base, 0, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za128_f16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv8f16(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z28test_svwrite_ver_za128_f16_mju10__SVBool_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv8f16(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za128_f16_m(uint32_t slice_base, svbool_t pg, svfloat16_t zn) { + SVE_ACLE_FUNC(svwrite_ver_za128,_f16,_m,)(15, slice_base, 0, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za128_bf16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv8bf16(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_ver_za128_bf16_mju10__SVBool_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv8bf16(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za128_bf16_m(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) { + SVE_ACLE_FUNC(svwrite_ver_za128,_bf16,_m,)(15, slice_base, 0, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za128_s32_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv4i32(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z28test_svwrite_ver_za128_s32_mju10__SVBool_tu11__SVInt32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv4i32(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za128_s32_m(uint32_t slice_base, svbool_t pg, svint32_t zn) { + SVE_ACLE_FUNC(svwrite_ver_za128,_s32,_m,)(15, slice_base, 0, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za128_u32_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv4i32(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z28test_svwrite_ver_za128_u32_mju10__SVBool_tu12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void 
@llvm.aarch64.sme.writeq.vert.nxv4i32(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za128_u32_m(uint32_t slice_base, svbool_t pg, svuint32_t zn) { + SVE_ACLE_FUNC(svwrite_ver_za128,_u32,_m,)(15, slice_base, 0, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za128_f32_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv4f32(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z28test_svwrite_ver_za128_f32_mju10__SVBool_tu13__SVFloat32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv4f32(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za128_f32_m(uint32_t slice_base, svbool_t pg, svfloat32_t zn) { + SVE_ACLE_FUNC(svwrite_ver_za128,_f32,_m,)(15, slice_base, 0, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za128_s64_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv2i64(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z28test_svwrite_ver_za128_s64_mju10__SVBool_tu11__SVInt64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv2i64(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za128_s64_m(uint32_t slice_base, svbool_t pg, svint64_t zn) { + SVE_ACLE_FUNC(svwrite_ver_za128,_s64,_m,)(15, slice_base, 0, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za128_u64_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv2i64(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z28test_svwrite_ver_za128_u64_mju10__SVBool_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv2i64(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za128_u64_m(uint32_t slice_base, svbool_t pg, svuint64_t zn) { + SVE_ACLE_FUNC(svwrite_ver_za128,_u64,_m,)(15, slice_base, 0, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za128_f64_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv2f64(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z28test_svwrite_ver_za128_f64_mju10__SVBool_tu13__SVFloat64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: 
[[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv2f64(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za128_f64_m(uint32_t slice_base, svbool_t pg, svfloat64_t zn) { + SVE_ACLE_FUNC(svwrite_ver_za128,_f64,_m,)(15, slice_base, 0, pg, zn); +} diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_zero.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_zero.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_zero.c @@ -0,0 +1,41 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + +#include + +// +// +__attribute__((arm_streaming_compatible, arm_shared_za)) +// CHECK-LABEL: @test_svzero_mask_za_3( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.zero(i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_svzero_mask_za_3v( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.zero(i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svzero_mask_za_3(void) { + svzero_mask_za(3); +} + +// +// +__attribute__((arm_streaming_compatible, arm_shared_za)) +// CHECK-LABEL: @test_svzero_za( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.zero(i32 255) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z14test_svzero_zav( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.zero(i32 255) +// CPP-CHECK-NEXT: ret void +// +void test_svzero_za(void) { + svzero_za(); +} diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/negative/acle_sme_add.c b/clang/test/CodeGen/aarch64-sme-intrinsics/negative/acle_sme_add.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/negative/acle_sme_add.c @@ -0,0 +1,54 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu \ +// RUN: -target-feature +sme-i16i64 -fsyntax-only -verify %s + +#include + +__attribute__((arm_streaming, arm_shared_za)) +void test_svaddha_za32_s32(svbool_t pn, svbool_t pm, svint32_t zn) { + // expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} + svaddha_za32_s32_m(4, pn, pm, zn); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svaddha_za32_u32(svbool_t pn, svbool_t pm, svuint32_t zn) { + // expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} + svaddha_za32_u32_m(4, pn, pm, zn); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svaddha_za64_s64(svbool_t pn, svbool_t pm, svint64_t zn) { + // expected-error@+1 {{argument value 8 is outside the valid range [0, 7]}} + svaddha_za64_s64_m(8, pn, pm, zn); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svaddha_za64_u64(svbool_t pn, svbool_t pm, svuint64_t zn) { + // expected-error@+1 {{argument value 8 is outside the valid range [0, 7]}} + svaddha_za64_u64_m(8, pn, pm, zn); +} + 
+__attribute__((arm_streaming, arm_shared_za)) +void test_svaddva_za32_s32(svbool_t pn, svbool_t pm, svint32_t zn) { + // expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} + svaddva_za32_s32_m(4, pn, pm, zn); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svaddva_za32_u32(svbool_t pn, svbool_t pm, svuint32_t zn) { + // expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} + svaddva_za32_u32_m(4, pn, pm, zn); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svaddva_za64_s64(svbool_t pn, svbool_t pm, svint64_t zn) { + // expected-error@+1 {{argument value 8 is outside the valid range [0, 7]}} + svaddva_za64_s64_m(8, pn, pm, zn); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svaddva_za64_u64(svbool_t pn, svbool_t pm, svuint64_t zn) { + // expected-error@+1 {{argument value 8 is outside the valid range [0, 7]}} + svaddva_za64_u64_m(8, pn, pm, zn); +} diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/negative/acle_sme_loads.c b/clang/test/CodeGen/aarch64-sme-intrinsics/negative/acle_sme_loads.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/negative/acle_sme_loads.c @@ -0,0 +1,70 @@ +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -fsyntax-only -verify %s + +#include + +__attribute__((arm_streaming, arm_shared_za)) +void test_svld1_hor_za8_bad_tile(uint32_t slice_base, svbool_t pg, const int8_t *base) { + // expected-error@+1 {{argument value 1 is outside the valid range [0, 0]}} + svld1_hor_za8(1, slice_base, 0, pg, base); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svld1_hor_za16_bad_tile(uint32_t slice_base, svbool_t pg, const int8_t *base) { + // expected-error@+1 {{argument value 2 is outside the valid range [0, 1]}} + svld1_hor_za16(2, slice_base, 0, pg, base); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svld1_hor_za32_bad_tile(uint32_t slice_base, svbool_t pg, const int8_t *base) { + // expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} + svld1_hor_za32(4, slice_base, 0, pg, base); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svld1_ver_za64_bad_tile(uint32_t slice_base, svbool_t pg, const int8_t *base) { + // expected-error@+1 {{argument value 8 is outside the valid range [0, 7]}} + svld1_ver_za64(8, slice_base, 0, pg, base); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svld1_ver_za128_bad_tile(uint32_t slice_base, svbool_t pg, const int8_t *base) { + // expected-error@+1 {{argument value 16 is outside the valid range [0, 15]}} + svld1_ver_za128(16, slice_base, 0, pg, base); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svld1_hor_za8_slice_offset(uint32_t slice_base, svbool_t pg, const int8_t *base) { + // expected-error@+1 {{argument value 16 is outside the valid range [0, 15]}} + svld1_hor_za8(0, slice_base, 16, pg, base); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svld1_ver_za16_slice_offset(uint32_t slice_base, svbool_t pg, const int8_t *base) { + // expected-error@+1 {{argument value 8 is outside the valid range [0, 7]}} + svld1_ver_za16(0, slice_base, 8, pg, base); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svld1_hor_za32_slice_offset(uint32_t slice_base, svbool_t pg, const int8_t *base) { + // expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} + svld1_hor_za32(0, slice_base, 4, pg, base); +} + 
+__attribute__((arm_streaming, arm_shared_za)) +void test_svld1_ver_za64_slice_offset(uint32_t slice_base, svbool_t pg, const int8_t *base) { + // expected-error@+1 {{argument value 2 is outside the valid range [0, 1]}} + svld1_ver_za64(0, slice_base, 2, pg, base); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svld1_ver_za128_slice_offset(uint32_t slice_base, svbool_t pg, const int8_t *base) { + // expected-error@+1 {{argument value 16 is outside the valid range [0, 0]}} + svld1_ver_za128(0, slice_base, 16, pg, base); +} + +__attribute__((arm_streaming_compatible, arm_shared_za)) +void test_svldr_vnum_za(uint32_t slice_base, const uint8_t *base) { + // expected-error@+1 {{argument value 16 is outside the valid range [0, 15]}} + svldr_vnum_za(slice_base, 16, base); +} diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/negative/acle_sme_mop.c b/clang/test/CodeGen/aarch64-sme-intrinsics/negative/acle_sme_mop.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/negative/acle_sme_mop.c @@ -0,0 +1,129 @@ +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 \ +// RUN: -triple aarch64-none-linux-gnu -target-feature +sme-i16i64 -target-feature +sme-f64f64 -fsyntax-only -verify %s + +#include + +// == MOPS / MOPA == + +__attribute__((arm_streaming, arm_shared_za)) +void test_svmopa_za32_bf16(svbool_t pn, svbool_t pm, svbfloat16_t zn, svbfloat16_t zm) { + // expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} + svmopa_za32_bf16_m(4, pn, pm, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svmopa_za32_f16(svbool_t pn, svbool_t pm, svfloat16_t zn, svfloat16_t zm) { + // expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} + svmopa_za32_f16_m(4, pn, pm, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svmopa_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svint8_t zm) { + // expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} + svmopa_za32_s8_m(4, pn, pm, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svmopa_za32_u8(svbool_t pn, svbool_t pm, svuint8_t zn, svuint8_t zm) { + // expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} + svmopa_za32_u8_m(4, pn, pm, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svmopa_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svint16_t zm) { + // expected-error@+1 {{argument value 8 is outside the valid range [0, 7]}} + svmopa_za64_s16_m(8, pn, pm, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svmopa_za64_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svuint16_t zm) { + // expected-error@+1 {{argument value 8 is outside the valid range [0, 7]}} + svmopa_za64_u16_m(8, pn, pm, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svmopa_za32_f32(svbool_t pn, svbool_t pm, svfloat32_t zn, svfloat32_t zm) { + // expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} + svmopa_za32_f32_m(4, pn, pm, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svmopa_za64_f64(svbool_t pn, svbool_t pm, svfloat64_t zn, svfloat64_t zm) { + // expected-error@+1 {{argument value 8 is outside the valid range [0, 7]}} + svmopa_za64_f64_m(8, pn, pm, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svmops_za32_bf16(svbool_t pn, svbool_t pm, svbfloat16_t zn, svbfloat16_t zm) { + // expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} + 
svmops_za32_bf16_m(4, pn, pm, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svmops_za32_f16(svbool_t pn, svbool_t pm, svfloat16_t zn, svfloat16_t zm) { + // expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} + svmops_za32_f16_m(4, pn, pm, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svmops_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svint8_t zm) { + // expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} + svmops_za32_s8_m(4, pn, pm, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svmops_za32_u8(svbool_t pn, svbool_t pm, svuint8_t zn, svuint8_t zm) { + // expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} + svmops_za32_u8_m(4, pn, pm, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svmops_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svint16_t zm) { + // expected-error@+1 {{argument value 8 is outside the valid range [0, 7]}} + svmops_za64_s16_m(8, pn, pm, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svmops_za64_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svuint16_t zm) { + // expected-error@+1 {{argument value 8 is outside the valid range [0, 7]}} + svmops_za64_u16_m(8, pn, pm, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svmops_za32_f32(svbool_t pn, svbool_t pm, svfloat32_t zn, svfloat32_t zm) { + // expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} + svmops_za32_f32_m(4, pn, pm, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svmops_za64_f64(svbool_t pn, svbool_t pm, svfloat64_t zn, svfloat64_t zm) { + // expected-error@+1 {{argument value 8 is outside the valid range [0, 7]}} + svmops_za64_f64_m(8, pn, pm, zn, zm); +} + +// == MIXED SIGN MOPA / MOPS == + +__attribute__((arm_streaming, arm_shared_za)) +void test_svsumops_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svuint8_t zm) { + // expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} + svsumops_za32_s8_m(4, pn, pm, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svsumops_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svuint16_t zm) { + // expected-error@+1 {{argument value 8 is outside the valid range [0, 7]}} + svsumops_za64_s16_m(8, pn, pm, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svusmops_za32_u8(svbool_t pn, svbool_t pm, svuint8_t zn, svint8_t zm) { + // expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} + svusmops_za32_u8_m(4, pn, pm, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svusmops_za64_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svint16_t zm) { + // expected-error@+1 {{argument value 8 is outside the valid range [0, 7]}} + svusmops_za64_u16_m(8, pn, pm, zn, zm); +} diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/negative/acle_sme_reads.c b/clang/test/CodeGen/aarch64-sme-intrinsics/negative/acle_sme_reads.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/negative/acle_sme_reads.c @@ -0,0 +1,64 @@ +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -fsyntax-only -verify %s + +#include + +__attribute__((arm_streaming, arm_shared_za)) +void test_svread_hor_za8_bad_tile(svint8_t zd, svbool_t pg, uint32_t slice_base) { + // expected-error@+1 {{argument value 1 is outside the valid range [0, 0]}} + svread_hor_za8_s8_m(zd, pg, 1, 
slice_base, 0); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svread_ver_za16_bad_tile(svuint16_t zd, svbool_t pg, uint32_t slice_base) { + // expected-error@+1 {{argument value 2 is outside the valid range [0, 1]}} + svread_ver_za16_u16_m(zd, pg, 2, slice_base, 0); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svread_ver_za32_bad_tile(svfloat32_t zd, svbool_t pg, uint32_t slice_base) { + // expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} + svread_ver_za32_f32_m(zd, pg, 4, slice_base, 0); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svread_hor_za64_bad_tile(svint64_t zd, svbool_t pg, uint32_t slice_base) { + // expected-error@+1 {{argument value 8 is outside the valid range [0, 7]}} + svread_hor_za64_s64_m(zd, pg, 8, slice_base, 0); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svread_hor_za128_bad_tile(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) { + // expected-error@+1 {{argument value 16 is outside the valid range [0, 15]}} + svread_hor_za128_bf16_m(zd, pg, 16, slice_base, 0); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svread_hor_za8_bad_slice(svint8_t zd, svbool_t pg, uint32_t slice_base) { + // expected-error@+1 {{argument value 16 is outside the valid range [0, 15]}} + svread_hor_za8_s8_m(zd, pg, 0, slice_base, 16); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svread_ver_za16_bad_slice(svuint16_t zd, svbool_t pg, uint32_t slice_base) { + // expected-error@+1 {{argument value 8 is outside the valid range [0, 7]}} + svread_ver_za16_u16_m(zd, pg, 1, slice_base, 8); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svread_ver_za32_bad_slice(svfloat32_t zd, svbool_t pg, uint32_t slice_base) { + // expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} + svread_ver_za32_f32_m(zd, pg, 3, slice_base, 4); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svread_hor_za64_bad_slice(svint64_t zd, svbool_t pg, uint32_t slice_base) { + // expected-error@+1 {{argument value 2 is outside the valid range [0, 1]}} + svread_hor_za64_s64_m(zd, pg, 7, slice_base, 2); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svread_hor_za128_bad_slice(svint32_t zd, svbool_t pg, uint32_t slice_base) { + // expected-error@+1 {{argument value 1 is outside the valid range [0, 0]}} + svread_hor_za128_s32_m(zd, pg, 15, slice_base, 1); +} diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/negative/acle_sme_stores.c b/clang/test/CodeGen/aarch64-sme-intrinsics/negative/acle_sme_stores.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/negative/acle_sme_stores.c @@ -0,0 +1,70 @@ +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -fsyntax-only -verify %s + +#include + +__attribute__((arm_streaming, arm_shared_za)) +void test_svst1_hor_za8_bad_tile(uint32_t slice_base, svbool_t pg, int8_t *base) { + // expected-error@+1 {{argument value 1 is outside the valid range [0, 0]}} + svst1_hor_za8(1, slice_base, 0, pg, base); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svst1_hor_za16_bad_tile(uint32_t slice_base, svbool_t pg, int8_t *base) { + // expected-error@+1 {{argument value 2 is outside the valid range [0, 1]}} + svst1_hor_za16(2, slice_base, 0, pg, base); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svst1_hor_za32_bad_tile(uint32_t slice_base, svbool_t pg, int8_t *base) { + // expected-error@+1 
{{argument value 4 is outside the valid range [0, 3]}} + svst1_hor_za32(4, slice_base, 0, pg, base); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svst1_ver_za64_bad_tile(uint32_t slice_base, svbool_t pg, int8_t *base) { + // expected-error@+1 {{argument value 8 is outside the valid range [0, 7]}} + svst1_ver_za64(8, slice_base, 0, pg, base); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svst1_ver_za128_bad_tile(uint32_t slice_base, svbool_t pg, int8_t *base) { + // expected-error@+1 {{argument value 16 is outside the valid range [0, 15]}} + svst1_ver_za128(16, slice_base, 0, pg, base); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svst1_hor_za8_slice_offset(uint32_t slice_base, svbool_t pg, int8_t *base) { + // expected-error@+1 {{argument value 16 is outside the valid range [0, 15]}} + svst1_hor_za8(0, slice_base, 16, pg, base); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svst1_ver_za16_slice_offset(uint32_t slice_base, svbool_t pg, int8_t *base) { + // expected-error@+1 {{argument value 8 is outside the valid range [0, 7]}} + svst1_ver_za16(0, slice_base, 8, pg, base); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svst1_hor_za32_slice_offset(uint32_t slice_base, svbool_t pg, int8_t *base) { + // expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} + svst1_hor_za32(0, slice_base, 4, pg, base); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svst1_ver_za64_slice_offset(uint32_t slice_base, svbool_t pg, int8_t *base) { + // expected-error@+1 {{argument value 2 is outside the valid range [0, 1]}} + svst1_ver_za64(0, slice_base, 2, pg, base); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svst1_ver_za128_slice_offset(uint32_t slice_base, svbool_t pg, int8_t *base) { + // expected-error@+1 {{argument value 1 is outside the valid range [0, 0]}} + svst1_ver_za128(0, slice_base, 1, pg, base); +} + +__attribute__((arm_streaming_compatible, arm_shared_za)) +void test_svstr_vnum_za(uint32_t slice_base, uint8_t *base) { + // expected-error@+1 {{argument value 16 is outside the valid range [0, 15]}} + svstr_vnum_za(slice_base, 16, base); +} diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/negative/acle_sme_writes.c b/clang/test/CodeGen/aarch64-sme-intrinsics/negative/acle_sme_writes.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/negative/acle_sme_writes.c @@ -0,0 +1,64 @@ +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -fsyntax-only -verify %s + +#include + +__attribute__((arm_streaming, arm_shared_za)) +void test_svwrite_hor_za8_bad_tile(uint32_t slice_base, svbool_t pg, svint8_t zn) { + // expected-error@+1 {{argument value 1 is outside the valid range [0, 0]}} + svwrite_hor_za8_s8_m(1, slice_base, 0, pg, zn); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svwrite_ver_za16_bad_tile(uint32_t slice_base, svbool_t pg, svuint16_t zn) { + // expected-error@+1 {{argument value 2 is outside the valid range [0, 1]}} + svwrite_ver_za16_u16_m(2, slice_base, 0, pg, zn); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svwrite_ver_za32_bad_tile(uint32_t slice_base, svbool_t pg, svfloat32_t zn) { + // expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} + svwrite_ver_za32_f32_m(4, slice_base, 0, pg, zn); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svwrite_hor_za64_bad_tile(uint32_t slice_base, svbool_t 
pg, svint64_t zn) { + // expected-error@+1 {{argument value 8 is outside the valid range [0, 7]}} + svwrite_hor_za64_s64_m(8, slice_base, 0, pg, zn); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svwrite_hor_za128_bad_tile(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) { + // expected-error@+1 {{argument value 16 is outside the valid range [0, 15]}} + svwrite_hor_za128_bf16_m(16, slice_base, 0, pg, zn); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svwrite_hor_za8_bad_slice(uint32_t slice_base, svbool_t pg, svint8_t zn) { + // expected-error@+1 {{argument value 16 is outside the valid range [0, 15]}} + svwrite_hor_za8_s8_m(0, slice_base, 16, pg, zn); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svwrite_ver_za16_bad_slice(uint32_t slice_base, svbool_t pg, svuint16_t zn) { + // expected-error@+1 {{argument value 8 is outside the valid range [0, 7]}} + svwrite_ver_za16_u16_m(1, slice_base, 8, pg, zn); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svwrite_ver_za32_bad_slice(uint32_t slice_base, svbool_t pg, svfloat32_t zn) { + // expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} + svwrite_ver_za32_f32_m(3, slice_base, 4, pg, zn); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svwrite_hor_za64_bad_slice(uint32_t slice_base, svbool_t pg, svint64_t zn) { + // expected-error@+1 {{argument value 2 is outside the valid range [0, 1]}} + svwrite_hor_za64_s64_m(7, slice_base, 2, pg, zn); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svwrite_hor_za128_bad_slice(svint32_t zd, svbool_t pg, uint32_t slice_base) { + // expected-error@+1 {{argument value 1 is outside the valid range [0, 0]}} + svwrite_hor_za128_s32_m(15, slice_base, 1, pg, zd); +} diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/negative/acle_sme_zero.c b/clang/test/CodeGen/aarch64-sme-intrinsics/negative/acle_sme_zero.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/negative/acle_sme_zero.c @@ -0,0 +1,10 @@ +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -fsyntax-only -verify %s + +#include + +__attribute__((arm_streaming, arm_shared_za)) +void test_svzero_mask_za(void) { + // expected-error@+1 {{argument value 256 is outside the valid range [0, 255]}} + svzero_mask_za(256); +} diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_add.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_add.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_add.c @@ -0,0 +1,1270 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py + +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | 
FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED,A5) A1##A3##A5 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4,A5) A1##A2##A3##A4##A5 +#endif + +// +// Single-Multi +// + +// x2 +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svadd_write_single2_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv4i32(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z28test_svadd_write_single2_s32j11svint32x2_tu11__SVInt32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv4i32(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svadd_write_single2_s32(uint32_t slice_base, svint32x2_t zn, svint32_t zm) { + SVE_ACLE_FUNC(svadd_write,_single,_za32,_s32,_vg1x2)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svadd_write_single2_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv4i32(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z28test_svadd_write_single2_u32j12svuint32x2_tu12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv4i32(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svadd_write_single2_u32(uint32_t slice_base, svuint32x2_t zn, svuint32_t zm) { + SVE_ACLE_FUNC(svadd_write,_single,_za32,_u32,_vg1x2)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svadd_write_single2_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call 
@llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv2i64(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z28test_svadd_write_single2_s64j11svint64x2_tu11__SVInt64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv2i64(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svadd_write_single2_s64(uint32_t slice_base, svint64x2_t zn, svint64_t zm) { + SVE_ACLE_FUNC(svadd_write,_single,_za64,_s64,_vg1x2)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svadd_write_single2_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv2i64(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z28test_svadd_write_single2_u64j12svuint64x2_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv2i64(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svadd_write_single2_u64(uint32_t slice_base, svuint64x2_t zn, svuint64_t zm) { + SVE_ACLE_FUNC(svadd_write,_single,_za64,_u64,_vg1x2)(slice_base, 7, zn, zm); +} + +// x4 + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svadd_write_single4_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.single.za.vg1x4.nxv4i32(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z28test_svadd_write_single4_s32j11svint32x4_tu11__SVInt32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 
[[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.single.za.vg1x4.nxv4i32(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svadd_write_single4_s32(uint32_t slice_base, svint32x4_t zn, svint32_t zm) { + SVE_ACLE_FUNC(svadd_write,_single,_za32,_s32,_vg1x4)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svadd_write_single4_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.single.za.vg1x4.nxv4i32(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z28test_svadd_write_single4_u32j12svuint32x4_tu12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.single.za.vg1x4.nxv4i32(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svadd_write_single4_u32(uint32_t slice_base, svuint32x4_t zn, svuint32_t zm) { + SVE_ACLE_FUNC(svadd_write,_single,_za32,_u32,_vg1x4)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svadd_write_single4_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.single.za.vg1x4.nxv2i64(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z28test_svadd_write_single4_s64j11svint64x4_tu11__SVInt64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.single.za.vg1x4.nxv2i64(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void 
+// +void test_svadd_write_single4_s64(uint32_t slice_base, svint64x4_t zn, svint64_t zm) { + SVE_ACLE_FUNC(svadd_write,_single,_za64,_s64,_vg1x4)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svadd_write_single4_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.single.za.vg1x4.nxv2i64(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z28test_svadd_write_single4_u64j12svuint64x4_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.single.za.vg1x4.nxv2i64(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svadd_write_single4_u64(uint32_t slice_base, svuint64x4_t zn, svuint64_t zm) { + SVE_ACLE_FUNC(svadd_write,_single,_za64,_u64,_vg1x4)(slice_base, 7, zn, zm); +} + +// +// Multi-Multi +// + +// x2 + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svadd_write_multi2_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.za.vg1x2.nxv4i32(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svadd_write_multi2_s32j11svint32x2_t11svint32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.za.vg1x2.nxv4i32(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svadd_write_multi2_s32(uint32_t slice_base, svint32x2_t zn, svint32x2_t zm) { + SVE_ACLE_FUNC(svadd_write,,_za32,_s32,_vg1x2)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, 
arm_shared_za)) +// CHECK-LABEL: @test_svadd_write_multi2_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.za.vg1x2.nxv4i32(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svadd_write_multi2_u32j12svuint32x2_t12svuint32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.za.vg1x2.nxv4i32(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svadd_write_multi2_u32(uint32_t slice_base, svuint32x2_t zn, svuint32x2_t zm) { + SVE_ACLE_FUNC(svadd_write,,_za32,_u32,_vg1x2)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svadd_write_multi2_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM]], i64 2) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.za.vg1x2.nxv2i64(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svadd_write_multi2_s64j11svint64x2_t11svint64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM]], i64 2) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.za.vg1x2.nxv2i64(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svadd_write_multi2_s64(uint32_t slice_base, svint64x2_t zn, svint64x2_t zm) { + SVE_ACLE_FUNC(svadd_write,,_za64,_s64,_vg1x2)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svadd_write_multi2_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) +// CHECK-NEXT: 
[[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM]], i64 2) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.za.vg1x2.nxv2i64(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svadd_write_multi2_u64j12svuint64x2_t12svuint64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM]], i64 2) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.za.vg1x2.nxv2i64(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svadd_write_multi2_u64(uint32_t slice_base, svuint64x2_t zn, svuint64x2_t zm) { + SVE_ACLE_FUNC(svadd_write,,_za64,_u64,_vg1x2)(slice_base, 7, zn, zm); +} + +// x4 + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svadd_write_multi4_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 12) +// CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.za.vg1x4.nxv4i32(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svadd_write_multi4_s32j11svint32x4_t11svint32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 12) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.za.vg1x4.nxv4i32(i32 [[TMP8]], [[TMP0]], [[TMP1]], 
[[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_svadd_write_multi4_s32(uint32_t slice_base, svint32x4_t zn, svint32x4_t zm) { + SVE_ACLE_FUNC(svadd_write,,_za32,_s32,_vg1x4)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svadd_write_multi4_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 12) +// CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.za.vg1x4.nxv4i32(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svadd_write_multi4_u32j12svuint32x4_t12svuint32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 12) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.za.vg1x4.nxv4i32(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_svadd_write_multi4_u32(uint32_t slice_base, svuint32x4_t zn, svuint32x4_t zm) { + SVE_ACLE_FUNC(svadd_write,,_za32,_u32,_vg1x4)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svadd_write_multi4_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 2) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 4) +// 
CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 6) +// CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.za.vg1x4.nxv2i64(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svadd_write_multi4_s64j11svint64x4_t11svint64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 2) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 6) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.za.vg1x4.nxv2i64(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_svadd_write_multi4_s64(uint32_t slice_base, svint64x4_t zn, svint64x4_t zm) { + SVE_ACLE_FUNC(svadd_write,,_za64,_s64,_vg1x4)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svadd_write_multi4_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 2) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 6) +// CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.za.vg1x4.nxv2i64(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svadd_write_multi4_u64j12svuint64x4_t12svuint64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 2) +// CPP-CHECK-NEXT: 
[[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 6) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.za.vg1x4.nxv2i64(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_svadd_write_multi4_u64(uint32_t slice_base, svuint64x4_t zn, svuint64x4_t zm) { + SVE_ACLE_FUNC(svadd_write,,_za64,_u64,_vg1x4)(slice_base, 7, zn, zm); +} + +// +// Multi-Single Vector +// + +// x2 + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svadd_vector_single2_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv16i8( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP4]], [[TMP5]], i64 16) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z28test_svadd_vector_single2_s810svint8x2_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv16i8( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP4]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svint8x2_t test_svadd_vector_single2_s8(svint8x2_t zn, svint8_t zm) { + return SVE_ACLE_FUNC(svadd,_single_s8_x2,,,)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svadd_vector_single2_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv16i8( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP4]], [[TMP5]], i64 16) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z28test_svadd_vector_single2_u811svuint8x2_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } 
@llvm.aarch64.sve.add.single.x2.nxv16i8( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP4]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svuint8x2_t test_svadd_vector_single2_u8(svuint8x2_t zn, svuint8_t zm) { + return SVE_ACLE_FUNC(svadd,_single_u8_x2,,,)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svadd_vector_single2_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv8i16( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z29test_svadd_vector_single2_s1611svint16x2_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv8i16( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svint16x2_t test_svadd_vector_single2_s16(svint16x2_t zn, svint16_t zm) { + return SVE_ACLE_FUNC(svadd,_single_s16_x2,,,)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svadd_vector_single2_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv8i16( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z29test_svadd_vector_single2_u1612svuint16x2_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv8i16( [[TMP0]], [[TMP1]], 
[[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svuint16x2_t test_svadd_vector_single2_u16(svuint16x2_t zn, svuint16_t zm) { + return SVE_ACLE_FUNC(svadd,_single_u16_x2,,,)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svadd_vector_single2_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv4i32( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z29test_svadd_vector_single2_s3211svint32x2_tu11__SVInt32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv4i32( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svint32x2_t test_svadd_vector_single2_s32(svint32x2_t zn, svint32_t zm) { + return SVE_ACLE_FUNC(svadd,_single_s32_x2,,,)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svadd_vector_single2_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv4i32( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z29test_svadd_vector_single2_u3212svuint32x2_tu12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv4i32( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } 
[[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svuint32x2_t test_svadd_vector_single2_u32(svuint32x2_t zn, svuint32_t zm) { + return SVE_ACLE_FUNC(svadd,_single_u32_x2,,,)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svadd_vector_single2_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv2i64( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP4]], [[TMP5]], i64 2) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z29test_svadd_vector_single2_s6411svint64x2_tu11__SVInt64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv2i64( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP4]], [[TMP5]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svint64x2_t test_svadd_vector_single2_s64(svint64x2_t zn, svint64_t zm) { + return SVE_ACLE_FUNC(svadd,_single_s64_x2,,,)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svadd_vector_single2_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv2i64( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP4]], [[TMP5]], i64 2) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z29test_svadd_vector_single2_u6412svuint64x2_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv2i64( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call 
@llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP4]], [[TMP5]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svuint64x2_t test_svadd_vector_single2_u64(svuint64x2_t zn, svuint64_t zm) { + return SVE_ACLE_FUNC(svadd,_single_u64_x2,,,)(zn, zm); +} + + +// x4 + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svadd_vector_single4_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP8]], [[TMP9]], i64 32) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 48) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z28test_svadd_vector_single4_s810svint8x4_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP8]], [[TMP9]], i64 32) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svint8x4_t test_svadd_vector_single4_s8(svint8x4_t zn, svint8_t zm) { + return SVE_ACLE_FUNC(svadd,_single_s8_x4,,,)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svadd_vector_single4_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: 
[[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP8]], [[TMP9]], i64 32) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 48) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z28test_svadd_vector_single4_u811svuint8x4_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP8]], [[TMP9]], i64 32) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svuint8x4_t test_svadd_vector_single4_u8(svuint8x4_t zn, svuint8_t zm) { + return SVE_ACLE_FUNC(svadd,_single_u8_x4,,,)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svadd_vector_single4_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } 
[[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 8) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP8]], [[TMP9]], i64 16) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 24) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z29test_svadd_vector_single4_s1611svint16x4_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 8) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP8]], [[TMP9]], i64 16) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svint16x4_t test_svadd_vector_single4_s16(svint16x4_t zn, svint16_t zm) { + return SVE_ACLE_FUNC(svadd,_single_s16_x4,,,)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svadd_vector_single4_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 8) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP8]], [[TMP9]], i64 16) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 24) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z29test_svadd_vector_single4_u1612svuint16x4_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// 
CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 8) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP8]], [[TMP9]], i64 16) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svuint16x4_t test_svadd_vector_single4_u16(svuint16x4_t zn, svuint16_t zm) { + return SVE_ACLE_FUNC(svadd,_single_u16_x4,,,)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svadd_vector_single4_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP8]], [[TMP9]], i64 8) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 12) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z29test_svadd_vector_single4_s3211svint32x4_tu11__SVInt32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// 
CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP8]], [[TMP9]], i64 8) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svint32x4_t test_svadd_vector_single4_s32(svint32x4_t zn, svint32_t zm) { + return SVE_ACLE_FUNC(svadd,_single_s32_x4,,,)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svadd_vector_single4_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP8]], [[TMP9]], i64 8) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 12) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z29test_svadd_vector_single4_u3212svuint32x4_tu12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP8]], [[TMP9]], i64 8) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], 
[[TMP11]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svuint32x4_t test_svadd_vector_single4_u32(svuint32x4_t zn, svuint32_t zm) { + return SVE_ACLE_FUNC(svadd,_single_u32_x4,,,)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svadd_vector_single4_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 2) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP8]], [[TMP9]], i64 4) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP10]], [[TMP11]], i64 6) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z29test_svadd_vector_single4_s6411svint64x4_tu11__SVInt64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 2) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP8]], [[TMP9]], i64 4) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP10]], [[TMP11]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svint64x4_t test_svadd_vector_single4_s64(svint64x4_t zn, svint64_t zm) { + return SVE_ACLE_FUNC(svadd,_single_s64_x4,,,)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svadd_vector_single4_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call 
@llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 2) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP8]], [[TMP9]], i64 4) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP10]], [[TMP11]], i64 6) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z29test_svadd_vector_single4_u6412svuint64x4_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 2) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP8]], [[TMP9]], i64 4) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP10]], [[TMP11]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svuint64x4_t test_svadd_vector_single4_u64(svuint64x4_t zn, svuint64_t zm) { + return SVE_ACLE_FUNC(svadd,_single_u64_x4,,,)(zn, zm); +} + +// +// Accumulate to ZA +// + +// x2 + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svadd_za32_vg1x2_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za32.vg1x2.nxv4f32(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svadd_za32_vg1x2_f32j13svfloat32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za32.vg1x2.nxv4f32(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void 
test_svadd_za32_vg1x2_f32(uint32_t slice_base, svfloat32x2_t zn) { + SVE_ACLE_FUNC(svadd_za32,,_f32,,_vg1x2)(slice_base, 6, zn); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svadd_za32_vg1x2_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za32.vg1x2.nxv4i32(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svadd_za32_vg1x2_s32j11svint32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za32.vg1x2.nxv4i32(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svadd_za32_vg1x2_s32(uint32_t slice_base, svint32x2_t zn) { + SVE_ACLE_FUNC(svadd_za32,,_s32,,_vg1x2)(slice_base, 6, zn); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svadd_za32_vg1x2_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za32.vg1x2.nxv4i32(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svadd_za32_vg1x2_u32j12svuint32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za32.vg1x2.nxv4i32(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svadd_za32_vg1x2_u32(uint32_t slice_base, svuint32x2_t zn) { + SVE_ACLE_FUNC(svadd_za32,,_u32,,_vg1x2)(slice_base, 6, zn); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svadd_za64_vg1x2_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za64.vg1x2.nxv2f64(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svadd_za64_vg1x2_f64j13svfloat64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za64.vg1x2.nxv2f64(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svadd_za64_vg1x2_f64(uint32_t slice_base, svfloat64x2_t zn) { + 
SVE_ACLE_FUNC(svadd_za64,,_f64,,_vg1x2)(slice_base, 6, zn); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svadd_za64_vg1x2_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za64.vg1x2.nxv2i64(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svadd_za64_vg1x2_s64j11svint64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za64.vg1x2.nxv2i64(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svadd_za64_vg1x2_s64(uint32_t slice_base, svint64x2_t zn) { + SVE_ACLE_FUNC(svadd_za64,,_s64,,_vg1x2)(slice_base, 6, zn); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svadd_za64_vg1x2_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za64.vg1x2.nxv2i64(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svadd_za64_vg1x2_u64j12svuint64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za64.vg1x2.nxv2i64(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svadd_za64_vg1x2_u64(uint32_t slice_base, svuint64x2_t zn) { + SVE_ACLE_FUNC(svadd_za64,,_u64,,_vg1x2)(slice_base, 6, zn); +} + +// x4 + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svadd_za32_vg1x4_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za32.vg1x4.nxv4f32(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svadd_za32_vg1x4_f32j13svfloat32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call 
@llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za32.vg1x4.nxv4f32(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svadd_za32_vg1x4_f32(uint32_t slice_base, svfloat32x4_t zn) { + SVE_ACLE_FUNC(svadd_za32,,_f32,,_vg1x4)(slice_base, 6, zn); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svadd_za32_vg1x4_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za32.vg1x4.nxv4i32(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svadd_za32_vg1x4_s32j11svint32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za32.vg1x4.nxv4i32(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svadd_za32_vg1x4_s32(uint32_t slice_base, svint32x4_t zn) { + SVE_ACLE_FUNC(svadd_za32,,_s32,,_vg1x4)(slice_base, 6, zn); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svadd_za32_vg1x4_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za32.vg1x4.nxv4i32(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svadd_za32_vg1x4_u32j12svuint32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za32.vg1x4.nxv4i32(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svadd_za32_vg1x4_u32(uint32_t slice_base, svuint32x4_t zn) { + 
SVE_ACLE_FUNC(svadd_za32,,_u32,,_vg1x4)(slice_base, 6, zn); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svadd_za64_vg1x4_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za64.vg1x4.nxv2f64(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svadd_za64_vg1x4_f64j13svfloat64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za64.vg1x4.nxv2f64(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svadd_za64_vg1x4_f64(uint32_t slice_base, svfloat64x4_t zn) { + SVE_ACLE_FUNC(svadd_za64,,_f64,,_vg1x4)(slice_base, 6, zn); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svadd_za64_vg1x4_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za64.vg1x4.nxv2i64(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svadd_za64_vg1x4_s64j11svint64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za64.vg1x4.nxv2i64(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svadd_za64_vg1x4_s64(uint32_t slice_base, svint64x4_t zn) { + SVE_ACLE_FUNC(svadd_za64,,_s64,,_vg1x4)(slice_base, 6, zn); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svadd_za64_vg1x4_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call 
@llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za64.vg1x4.nxv2i64(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svadd_za64_vg1x4_u64j12svuint64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za64.vg1x4.nxv2i64(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svadd_za64_vg1x4_u64(uint32_t slice_base, svuint64x4_t zn) { + SVE_ACLE_FUNC(svadd_za64,,_u64,,_vg1x4)(slice_base, 6, zn); +} diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_create2_bool.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_create2_bool.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_create2_bool.c @@ -0,0 +1,33 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK + +// REQUIRES: aarch64-registered-target + +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. 
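+// For example, SVE_ACLE_FUNC(svcreate2,_b8,,) pastes only the first and third
+// arguments when SVE_OVERLOADED_FORMS is defined, resolving to the type-generic
+// svcreate2, and pastes all four arguments otherwise, resolving to the explicit
+// svcreate2_b8 form (illustrative expansion of the macro defined just below).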
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4
+#endif
+
+// CHECK-LABEL: @test_svcreate2_s8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 32 x i1> @llvm.vector.insert.nxv32i1.nxv16i1(<vscale x 32 x i1> poison, <vscale x 16 x i1> [[X0:%.*]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 32 x i1> @llvm.vector.insert.nxv32i1.nxv16i1(<vscale x 32 x i1> [[TMP0]], <vscale x 16 x i1> [[X1:%.*]], i64 16)
+// CHECK-NEXT:    ret <vscale x 32 x i1> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z17test_svcreate2_s8u10__SVBool_tu10__SVBool_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 32 x i1> @llvm.vector.insert.nxv32i1.nxv16i1(<vscale x 32 x i1> poison, <vscale x 16 x i1> [[X0:%.*]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 32 x i1> @llvm.vector.insert.nxv32i1.nxv16i1(<vscale x 32 x i1> [[TMP0]], <vscale x 16 x i1> [[X1:%.*]], i64 16)
+// CPP-CHECK-NEXT:    ret <vscale x 32 x i1> [[TMP1]]
+//
+svboolx2_t test_svcreate2_s8(svbool_t x0, svbool_t x1)
+{
+  return SVE_ACLE_FUNC(svcreate2,_b8,,)(x0, x1);
+}
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_create4_bool.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_create4_bool.c
new file mode 100644
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_create4_bool.c
@@ -0,0 +1,37 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+
+// REQUIRES: aarch64-registered-target
+
+#include 
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4
+#endif
+
+// CHECK-LABEL: @test_svcreate4_b8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 64 x i1> @llvm.vector.insert.nxv64i1.nxv16i1(<vscale x 64 x i1> poison, <vscale x 16 x i1> [[X0:%.*]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 64 x i1> @llvm.vector.insert.nxv64i1.nxv16i1(<vscale x 64 x i1> [[TMP0]], <vscale x 16 x i1> [[X1:%.*]], i64 16)
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 64 x i1> @llvm.vector.insert.nxv64i1.nxv16i1(<vscale x 64 x i1> [[TMP1]], <vscale x 16 x i1> [[X2:%.*]], i64 32)
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 64 x i1> @llvm.vector.insert.nxv64i1.nxv16i1(<vscale x 64 x i1> [[TMP2]], <vscale x 16 x i1> [[X4:%.*]], i64 48)
+// CHECK-NEXT:    ret <vscale x 64 x i1> [[TMP3]]
+//
+// CPP-CHECK-LABEL: @_Z17test_svcreate4_b8u10__SVBool_tu10__SVBool_tu10__SVBool_tu10__SVBool_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 64 x i1> @llvm.vector.insert.nxv64i1.nxv16i1(<vscale x 64 x i1> poison, <vscale x 16 x i1> [[X0:%.*]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 64 x i1> @llvm.vector.insert.nxv64i1.nxv16i1(<vscale x 64 x i1> [[TMP0]], <vscale x 16 x i1> [[X1:%.*]], i64 16)
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 64 x i1> @llvm.vector.insert.nxv64i1.nxv16i1(<vscale x 64 x i1> [[TMP1]], <vscale x 16 x i1> [[X2:%.*]], i64 32)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 64 x i1> @llvm.vector.insert.nxv64i1.nxv16i1(<vscale x 64 x i1> [[TMP2]], <vscale x 16 x i1> [[X4:%.*]], i64 48)
+// CPP-CHECK-NEXT:    ret <vscale x 64 x i1> [[TMP3]]
+//
+svboolx4_t test_svcreate4_b8(svbool_t x0, svbool_t x1, svbool_t x2, svbool_t x4)
+{
+  return SVE_ACLE_FUNC(svcreate4,_b8,,)(x0, x1, x2, x4);
+}
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvt.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvt.c
new file mode 100644
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvt.c
@@ -0,0 +1,518 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+
+// REQUIRES: aarch64-registered-target
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+
+#include 
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 +#endif + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_cvt_f16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.fcvt.x2.nxv4f32( [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z15test_cvt_f16_x213svfloat32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.fcvt.x2.nxv4f32( [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svfloat16_t test_cvt_f16_x2(svfloat32x2_t zn) { + return SVE_ACLE_FUNC(svcvt_f16,_f32_x2,,)(zn); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_cvt_bf16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.bfcvt.x2( [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z16test_cvt_bf16_x213svfloat32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.bfcvt.x2( [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svbfloat16_t test_cvt_bf16_x2(svfloat32x2_t zn) { + return SVE_ACLE_FUNC(svcvt_bf16,_f32_x2,,)(zn); +} + + +// x2 +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svcvt_f32_u32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.fcvtu.x2.nxv4f32( [[TMP0]], [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z21test_svcvt_f32_u32_x212svuint32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.fcvtu.x2.nxv4f32( [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svfloat32x2_t 
test_svcvt_f32_u32_x2(svuint32x2_t zn){ + return SVE_ACLE_FUNC(svcvt_f32,_u32_x2,,)(zn); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svcvt_f32_s32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.fcvts.x2.nxv4f32( [[TMP0]], [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z21test_svcvt_f32_s32_x211svint32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.fcvts.x2.nxv4f32( [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svfloat32x2_t test_svcvt_f32_s32_x2(svint32x2_t zn){ + return SVE_ACLE_FUNC(svcvt_f32,_s32_x2,,)(zn); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svcvt_u32_f32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.ucvtf.x2.nxv4f32( [[TMP0]], [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z21test_svcvt_u32_f32_x213svfloat32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.ucvtf.x2.nxv4f32( [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svuint32x2_t test_svcvt_u32_f32_x2(svfloat32x2_t zn){ + return SVE_ACLE_FUNC(svcvt_u32,_f32_x2,,)(zn); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svcvt_s32_f32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call 
@llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.scvtf.x2.nxv4f32( [[TMP0]], [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z21test_svcvt_s32_f32_x213svfloat32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.scvtf.x2.nxv4f32( [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svint32x2_t test_svcvt_s32_f32_x2(svfloat32x2_t zn){ + return SVE_ACLE_FUNC(svcvt_s32,_f32_x2,,)(zn); +} + +// x4 +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svcvt_f32_u32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.fcvtu.x4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 4) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP8]], [[TMP9]], i64 8) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP10]], [[TMP11]], i64 12) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z21test_svcvt_f32_u32_x412svuint32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.fcvtu.x4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// 
CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 4) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP8]], [[TMP9]], i64 8) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP10]], [[TMP11]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svfloat32x4_t test_svcvt_f32_u32_x4(svuint32x4_t zn){ + return SVE_ACLE_FUNC(svcvt_f32,_u32_x4,,)(zn); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svcvt_f32_s32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.fcvts.x4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 4) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP8]], [[TMP9]], i64 8) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP10]], [[TMP11]], i64 12) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z21test_svcvt_f32_s32_x411svint32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.fcvts.x4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 4) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP8]], [[TMP9]], i64 8) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP10]], [[TMP11]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svfloat32x4_t 
test_svcvt_f32_s32_x4(svint32x4_t zn){ + return SVE_ACLE_FUNC(svcvt_f32,_s32_x4,,)(zn); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svcvt_u32_f32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.ucvtf.x4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP8]], [[TMP9]], i64 8) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 12) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z21test_svcvt_u32_f32_x413svfloat32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.ucvtf.x4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP8]], [[TMP9]], i64 8) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svuint32x4_t test_svcvt_u32_f32_x4(svfloat32x4_t zn){ + return SVE_ACLE_FUNC(svcvt_u32,_f32_x4,,)(zn); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svcvt_s32_f32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.scvtf.x4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: 
[[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP8]], [[TMP9]], i64 8) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 12) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z21test_svcvt_s32_f32_x413svfloat32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.scvtf.x4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP8]], [[TMP9]], i64 8) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svint32x4_t test_svcvt_s32_f32_x4(svfloat32x4_t zn){ + return SVE_ACLE_FUNC(svcvt_s32,_f32_x4,,)(zn); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_qcvt_s16_s32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.sqcvt.x2.nxv4i32( [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z20test_qcvt_s16_s32_x211svint32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.sqcvt.x2.nxv4i32( [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svint16_t test_qcvt_s16_s32_x2(svint32x2_t zn) { + return SVE_ACLE_FUNC(svqcvt_s16,_s32_x2,,)(zn); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_qcvt_u16_u32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.uqcvt.x2.nxv4i32( [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret 
[[TMP2]] +// +// CPP-CHECK-LABEL: @_Z20test_qcvt_u16_u32_x212svuint32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.uqcvt.x2.nxv4i32( [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svuint16_t test_qcvt_u16_u32_x2(svuint32x2_t zn) { + return SVE_ACLE_FUNC(svqcvt_u16,_u32_x2,,)(zn); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_qcvt_u16_s32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.sqcvtu.x2.nxv4i32( [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z20test_qcvt_u16_s32_x211svint32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.sqcvtu.x2.nxv4i32( [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svuint16_t test_qcvt_u16_s32_x2(svint32x2_t zn) { + return SVE_ACLE_FUNC(svqcvt_u16,_s32_x2,,)(zn); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_qcvt_u8_u32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.uqcvt.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z19test_qcvt_u8_u32_x412svuint32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.uqcvt.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint8_t test_qcvt_u8_u32_x4(svuint32x4_t zn) { + return SVE_ACLE_FUNC(svqcvt_u8,_u32_x4,,)(zn); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_qcvt_u16_u64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.uqcvt.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: 
@_Z20test_qcvt_u16_u64_x412svuint64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.uqcvt.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint16_t test_qcvt_u16_u64_x4(svuint64x4_t zn) { + return SVE_ACLE_FUNC(svqcvt_u16,_u64_x4,,)(zn); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_qcvt_s8_s32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.sqcvt.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z19test_qcvt_s8_s32_x411svint32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.sqcvt.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svint8_t test_qcvt_s8_s32_x4(svint32x4_t zn) { + return SVE_ACLE_FUNC(svqcvt_s8,_s32_x4,,)(zn); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_qcvt_s16_s64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.sqcvt.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z20test_qcvt_s16_s64_x411svint64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.sqcvt.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svint16_t test_qcvt_s16_s64_x4(svint64x4_t zn) { + return SVE_ACLE_FUNC(svqcvt_s16,_s64_x4,,)(zn); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_qcvt_u8_s32_x4( +// 
CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.sqcvtu.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z19test_qcvt_u8_s32_x411svint32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.sqcvtu.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint8_t test_qcvt_u8_s32_x4(svint32x4_t zn) { + return SVE_ACLE_FUNC(svqcvt_u8,_s32_x4,,)(zn); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_qcvt_u16_s64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.sqcvtu.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z20test_qcvt_u16_s64_x411svint64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.sqcvtu.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint16_t test_qcvt_u16_s64_x4(svint64x4_t zn) { + return SVE_ACLE_FUNC(svqcvt_u16,_s64_x4,,)(zn); +} diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvtn.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvtn.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvtn.c @@ -0,0 +1,253 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py + +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -D__SVE_OVERLOADED_FORMS 
-triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -D__SVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 +#endif + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_cvtn_f16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.fcvtn.x2.nxv4f32( [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z16test_cvtn_f16_x213svfloat32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.fcvtn.x2.nxv4f32( [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svfloat16_t test_cvtn_f16_x2(svfloat32x2_t zn) { + return SVE_ACLE_FUNC(svcvtn_f16,_f32_x2,,)(zn); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_cvtn_bf16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.bfcvtn.x2( [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z17test_cvtn_bf16_x213svfloat32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.bfcvtn.x2( [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svbfloat16_t test_cvtn_bf16_x2(svfloat32x2_t zn) { + return SVE_ACLE_FUNC(svcvtn_bf16,_f32_x2,,)(zn); +} + +__attribute__((arm_streaming_compatible)) +// CHECK-LABEL: @test_qcvtn_s16_s32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.sqcvtn.x2.nxv4i32( [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z21test_qcvtn_s16_s32_x211svint32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.sqcvtn.x2.nxv4i32( [[TMP0]], [[TMP1]]) +// 
CPP-CHECK-NEXT: ret [[TMP2]] +// +svint16_t test_qcvtn_s16_s32_x2(svint32x2_t zn) { + return SVE_ACLE_FUNC(svqcvtn_s16,_s32_x2,,)(zn); +} + +__attribute__((arm_streaming_compatible)) +// CHECK-LABEL: @test_qcvtn_u16_u32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.uqcvtn.x2.nxv4i32( [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z21test_qcvtn_u16_u32_x212svuint32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.uqcvtn.x2.nxv4i32( [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svuint16_t test_qcvtn_u16_u32_x2(svuint32x2_t zn) { + return SVE_ACLE_FUNC(svqcvtn_u16,_u32_x2,,)(zn); +} + +__attribute__((arm_streaming_compatible)) +// CHECK-LABEL: @test_qcvtn_u16_s32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.sqcvtun.x2.nxv4i32( [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z21test_qcvtn_u16_s32_x211svint32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.sqcvtun.x2.nxv4i32( [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svuint16_t test_qcvtn_u16_s32_x2(svint32x2_t zn) { + return SVE_ACLE_FUNC(svqcvtn_u16,_s32_x2,,)(zn); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_qcvtn_u8_u32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.uqcvtn.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z20test_qcvtn_u8_u32_x412svuint32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.uqcvtn.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint8_t test_qcvtn_u8_u32_x4(svuint32x4_t zn) { + return SVE_ACLE_FUNC(svqcvtn_u8,_u32_x4,,)(zn); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_qcvtn_u16_u64_x4( +// CHECK-NEXT: 
entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.uqcvtn.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z21test_qcvtn_u16_u64_x412svuint64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.uqcvtn.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint16_t test_qcvtn_u16_u64_x4(svuint64x4_t zn) { + return SVE_ACLE_FUNC(svqcvtn_u16,_u64_x4,,)(zn); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_qcvtn_s8_s32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.sqcvtn.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z20test_qcvtn_s8_s32_x411svint32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.sqcvtn.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svint8_t test_qcvtn_s8_s32_x4(svint32x4_t zn) { + return SVE_ACLE_FUNC(svqcvtn_s8,_s32_x4,,)(zn); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_qcvtn_s16_s64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.sqcvtn.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z21test_qcvtn_s16_s64_x411svint64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call 
@llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.sqcvtn.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svint16_t test_qcvtn_s16_s64_x4(svint64x4_t zn) { + return SVE_ACLE_FUNC(svqcvtn_s16,_s64_x4,,)(zn); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_qcvtn_u8_32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.sqcvtun.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z19test_qcvtn_u8_32_x411svint32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.sqcvtun.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint8_t test_qcvtn_u8_32_x4(svint32x4_t zn) { + return SVE_ACLE_FUNC(svqcvtn_u8,_s32_x4,,)(zn); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_qcvtn_u16_s64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.sqcvtun.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z21test_qcvtn_u16_s64_x411svint64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.sqcvtun.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint16_t test_qcvtn_u16_s64_x4(svint64x4_t zn) { + return SVE_ACLE_FUNC(svqcvtn_u16,_s64_x4,,)(zn); +} + + diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_fp_dots.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_fp_dots.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_fp_dots.c @@ -0,0 +1,339 @@ +// NOTE: 
Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED,A5) A1##A3##A5 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4,A5) A1##A2##A3##A4##A5 +#endif + +// +// Multi, multi (half) +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svdot_multi_za32_vg1x2_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fdot.za32.vg1x2.nxv8f16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z31test_svdot_multi_za32_vg1x2_f16j13svfloat16x2_t13svfloat16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fdot.za32.vg1x2.nxv8f16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_multi_za32_vg1x2_f16(uint32_t slice_base, svfloat16x2_t zn, svfloat16x2_t zm) { + SVE_ACLE_FUNC(svdot,,_za32,_f16,_vg1x2)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svdot_multi_za32_vg1x4_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 24) +// 
CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 24) +// CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fdot.za32.vg1x4.nxv8f16(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z31test_svdot_multi_za32_vg1x4_f16j13svfloat16x4_t13svfloat16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 24) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fdot.za32.vg1x4.nxv8f16(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_multi_za32_vg1x4_f16(uint32_t slice_base, svfloat16x4_t zn, svfloat16x4_t zm) { + SVE_ACLE_FUNC(svdot,,_za32,_f16,_vg1x4)(slice_base, 7, zn, zm); +} + + +// +// Multi, single (half) +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svdot_single_za32_vg1x2_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fdot.single.za32.vg1x2.nxv8f16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z32test_svdot_single_za32_vg1x2_f16j13svfloat16x2_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fdot.single.za32.vg1x2.nxv8f16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_single_za32_vg1x2_f16(uint32_t slice_base, svfloat16x2_t zn, svfloat16_t zm) { + SVE_ACLE_FUNC(svdot,_single,_za32,_f16,_vg1x2)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svdot_single_za32_vg1x4_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call 
@llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fdot.single.za32.vg1x4.nxv8f16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z32test_svdot_single_za32_vg1x4_f16j13svfloat16x4_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fdot.single.za32.vg1x4.nxv8f16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_single_za32_vg1x4_f16(uint32_t slice_base, svfloat16x4_t zn, svfloat16_t zm) { + SVE_ACLE_FUNC(svdot,_single,_za32,_f16,_vg1x4)(slice_base, 7, zn, zm); +} + + +// +// Multi, indexed (half) +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svdot_lane_za32_vg1x2_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fdot.lane.za32.vg1x2.nxv8f16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z30test_svdot_lane_za32_vg1x2_f16j13svfloat16x2_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fdot.lane.za32.vg1x2.nxv8f16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_lane_za32_vg1x2_f16(uint32_t slice_base, svfloat16x2_t zn, svfloat16_t zm) { + SVE_ACLE_FUNC(svdot_lane,,_za32,_f16,_vg1x2)(slice_base, 7, zn, zm, 3); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svdot_lane_za32_vg1x4_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fdot.lane.za32.vg1x4.nxv8f16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: 
@_Z30test_svdot_lane_za32_vg1x4_f16j13svfloat16x4_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fdot.lane.za32.vg1x4.nxv8f16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_lane_za32_vg1x4_f16(uint32_t slice_base, svfloat16x4_t zn, svfloat16_t zm) { + SVE_ACLE_FUNC(svdot_lane,,_za32,_f16,_vg1x4)(slice_base, 7, zn, zm, 3); +} + + +// +// Multi, multi (bfloat) +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svdot_multi_za32_vg1x2_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fdot.za32.vg1x2.nxv8bf16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z32test_svdot_multi_za32_vg1x2_bf16j14svbfloat16x2_t14svbfloat16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fdot.za32.vg1x2.nxv8bf16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_multi_za32_vg1x2_bf16(uint32_t slice_base, svbfloat16x2_t zn, svbfloat16x2_t zm) { + SVE_ACLE_FUNC(svdot,,_za32,_bf16,_vg1x2)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svdot_multi_za32_vg1x4_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZM]], i64 24) +// 
CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fdot.za32.vg1x4.nxv8bf16(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z32test_svdot_multi_za32_vg1x4_bf16j14svbfloat16x4_t14svbfloat16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZM]], i64 24) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fdot.za32.vg1x4.nxv8bf16(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_multi_za32_vg1x4_bf16(uint32_t slice_base, svbfloat16x4_t zn, svbfloat16x4_t zm) { + SVE_ACLE_FUNC(svdot,,_za32,_bf16,_vg1x4)(slice_base, 7, zn, zm); +} + + +// +// Multi, single (bfloat) +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svdot_single_za32_vg1x2_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fdot.single.za32.vg1x2.nxv8bf16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z33test_svdot_single_za32_vg1x2_bf16j14svbfloat16x2_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fdot.single.za32.vg1x2.nxv8bf16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_single_za32_vg1x2_bf16(uint32_t slice_base, svbfloat16x2_t zn, svbfloat16_t zm) { + SVE_ACLE_FUNC(svdot,_single,_za32,_bf16,_vg1x2)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svdot_single_za32_vg1x4_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void 
@llvm.aarch64.sme.fdot.single.za32.vg1x4.nxv8bf16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z33test_svdot_single_za32_vg1x4_bf16j14svbfloat16x4_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fdot.single.za32.vg1x4.nxv8bf16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_single_za32_vg1x4_bf16(uint32_t slice_base, svbfloat16x4_t zn, svbfloat16_t zm) { + SVE_ACLE_FUNC(svdot,_single,_za32,_bf16,_vg1x4)(slice_base, 7, zn, zm); +} + + +// +// Multi, indexed (bfloat) +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svdot_lane_za32_vg1x2_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fdot.lane.za32.vg1x2.nxv8bf16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z31test_svdot_lane_za32_vg1x2_bf16j14svbfloat16x2_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fdot.lane.za32.vg1x2.nxv8bf16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_lane_za32_vg1x2_bf16(uint32_t slice_base, svbfloat16x2_t zn, svbfloat16_t zm) { + SVE_ACLE_FUNC(svdot_lane,,_za32,_bf16,_vg1x2)(slice_base, 7, zn, zm, 3); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svdot_lane_za32_vg1x4_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fdot.lane.za32.vg1x4.nxv8bf16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z31test_svdot_lane_za32_vg1x4_bf16j14svbfloat16x4_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call 
@llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fdot.lane.za32.vg1x4.nxv8bf16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_lane_za32_vg1x4_bf16(uint32_t slice_base, svbfloat16x4_t zn, svbfloat16_t zm) { + SVE_ACLE_FUNC(svdot_lane,,_za32,_bf16,_vg1x4)(slice_base, 7, zn, zm, 3); +} diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_frint.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_frint.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_frint.c @@ -0,0 +1,290 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py + +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. 
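+// For instance, SVE_ACLE_FUNC(svrinta,_f32_x2,,) below expands to the full
+// name svrinta_f32_x2 by default, and to the overloaded name svrinta when
+// SVE_OVERLOADED_FORMS is defined.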
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 +#endif + +// FRINTA + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svfrinta_f32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.frinta.x2.nxv4f32( [[TMP0]], [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z20test_svfrinta_f32_x213svfloat32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.frinta.x2.nxv4f32( [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svfloat32x2_t test_svfrinta_f32_x2(svfloat32x2_t zn) { + return SVE_ACLE_FUNC(svrinta,_f32_x2,,)(zn); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svfrinta_f32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.frinta.x4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 4) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP8]], [[TMP9]], i64 8) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP10]], [[TMP11]], i64 12) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z20test_svfrinta_f32_x413svfloat32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call 
@llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.frinta.x4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 4) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP8]], [[TMP9]], i64 8) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP10]], [[TMP11]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svfloat32x4_t test_svfrinta_f32_x4(svfloat32x4_t zn) { + return SVE_ACLE_FUNC(svrinta,_f32_x4,,)(zn); +} + +// FRINTM + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svfrintam_f32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.frintm.x2.nxv4f32( [[TMP0]], [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z21test_svfrintam_f32_x213svfloat32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.frintm.x2.nxv4f32( [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svfloat32x2_t test_svfrintam_f32_x2(svfloat32x2_t zn) { + return SVE_ACLE_FUNC(svrintm,_f32_x2,,)(zn); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svfrintm_f32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.frintm.x4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } 
[[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 4) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP8]], [[TMP9]], i64 8) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP10]], [[TMP11]], i64 12) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z20test_svfrintm_f32_x413svfloat32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.frintm.x4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 4) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP8]], [[TMP9]], i64 8) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP10]], [[TMP11]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svfloat32x4_t test_svfrintm_f32_x4(svfloat32x4_t zn) { + return SVE_ACLE_FUNC(svrintm,_f32_x4,,)(zn); +} + +// FRINTN + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svfrintn_f32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.frintn.x2.nxv4f32( [[TMP0]], [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z20test_svfrintn_f32_x213svfloat32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.frintn.x2.nxv4f32( [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP6]] +// 
+svfloat32x2_t test_svfrintn_f32_x2(svfloat32x2_t zn) { + return SVE_ACLE_FUNC(svrintn,_f32_x2,,)(zn); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svfrintn_f32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.frintn.x4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 4) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP8]], [[TMP9]], i64 8) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP10]], [[TMP11]], i64 12) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z20test_svfrintn_f32_x413svfloat32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.frintn.x4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 4) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP8]], [[TMP9]], i64 8) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP10]], [[TMP11]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svfloat32x4_t test_svfrintn_f32_x4(svfloat32x4_t zn) { + return SVE_ACLE_FUNC(svrintn,_f32_x4,,)(zn); +} + +// FRINTP + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svfrintp_f32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.frintp.x2.nxv4f32( [[TMP0]], [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } 
[[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z20test_svfrintp_f32_x213svfloat32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.frintp.x2.nxv4f32( [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svfloat32x2_t test_svfrintp_f32_x2(svfloat32x2_t zn) { + return SVE_ACLE_FUNC(svrintp,_f32_x2,,)(zn); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svfrintp_f32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.frintp.x4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 4) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP8]], [[TMP9]], i64 8) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP10]], [[TMP11]], i64 12) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z20test_svfrintp_f32_x413svfloat32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.frintp.x4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 4) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP8]], [[TMP9]], i64 8) +// 
CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP10]], [[TMP11]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svfloat32x4_t test_svfrintp_f32_x4(svfloat32x4_t zn) { + return SVE_ACLE_FUNC(svrintp,_f32_x4,,)(zn); +} diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_get2-bool.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_get2-bool.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_get2-bool.c @@ -0,0 +1,45 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. 
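+// Illustrative note (not produced by update_cc_test_checks.py): with
+// SVE_OVERLOADED_FORMS defined, the macro below keeps only the base name,
+// so SVE_ACLE_FUNC(svget2,_b8,,) expands to the overloaded form "svget2";
+// without it, the same invocation expands to the explicit "svget2_b8".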
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 +#endif + +// CHECK-LABEL: @test_svget2_b8_0( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i1.nxv32i1( [[TUPLE:%.*]], i64 0) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z16test_svget2_b8_010svboolx2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i1.nxv32i1( [[TUPLE:%.*]], i64 0) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svbool_t test_svget2_b8_0(svboolx2_t tuple) +{ + return SVE_ACLE_FUNC(svget2,_b8,,)(tuple, 0); +} + +// CHECK-LABEL: @test_svget2_b8_1( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i1.nxv32i1( [[TUPLE:%.*]], i64 16) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z16test_svget2_b8_110svboolx2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i1.nxv32i1( [[TUPLE:%.*]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svbool_t test_svget2_b8_1(svboolx2_t tuple) +{ + return SVE_ACLE_FUNC(svget2,_b8,,)(tuple, 1); +} diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_get4-bool.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_get4-bool.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_get4-bool.c @@ -0,0 +1,66 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 +#endif + +// NOTE: For these tests clang converts the struct parameter into +// several parameters, one for each member of the original struct. +// CHECK-LABEL: @test_svget4_b8_0( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i1.nxv64i1( [[TUPLE:%.*]], i64 0) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z16test_svget4_b8_010svboolx4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i1.nxv64i1( [[TUPLE:%.*]], i64 0) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svbool_t test_svget4_b8_0(svboolx4_t tuple) +{ + return SVE_ACLE_FUNC(svget4,_b8,,)(tuple, 0); +} + +// NOTE: For these tests clang converts the struct parameter into +// several parameters, one for each member of the original struct. 
+// CHECK-LABEL: @test_svget4_b8_1( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i1.nxv64i1( [[TUPLE:%.*]], i64 16) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z16test_svget4_b8_110svboolx4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i1.nxv64i1( [[TUPLE:%.*]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svbool_t test_svget4_b8_1(svboolx4_t tuple) +{ + return SVE_ACLE_FUNC(svget4,_b8,,)(tuple, 1); +} + +// NOTE: For these tests clang converts the struct parameter into +// several parameters, one for each member of the original struct. +// CHECK-LABEL: @test_svget4_b8_3( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i1.nxv64i1( [[TUPLE:%.*]], i64 48) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z16test_svget4_b8_310svboolx4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i1.nxv64i1( [[TUPLE:%.*]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svbool_t test_svget4_b8_3(svboolx4_t tuple) +{ + return SVE_ACLE_FUNC(svget4,_b8,,)(tuple, 3); +} diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_int_dots.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_int_dots.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_int_dots.c @@ -0,0 +1,1205 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. 
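+// Illustrative note (not produced by update_cc_test_checks.py): with
+// SVE_OVERLOADED_FORMS defined, SVE_ACLE_FUNC(svdot,,_za32,_u16,_vg1x2)
+// expands to the overloaded name "svdot_za32_vg1x2"; without it, the same
+// invocation expands to the explicit name "svdot_za32_u16_vg1x2".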
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED,A5) A1##A3##A5 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4,A5) A1##A2##A3##A4##A5 +#endif + +// +// Multi, multi (unsigned) +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svdot_multi_za32_vg1x2_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.za32.vg1x2.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z31test_svdot_multi_za32_vg1x2_u16j12svuint16x2_t12svuint16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.za32.vg1x2.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_multi_za32_vg1x2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16x2_t zm) { + SVE_ACLE_FUNC(svdot,,_za32,_u16,_vg1x2)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svdot_multi_za32_vg1x4_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 24) +// CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.za32.vg1x4.nxv8i16(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z31test_svdot_multi_za32_vg1x4_u16j12svuint16x4_t12svuint16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call 
@llvm.vector.extract.nxv8i16.nxv32i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 24) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.za32.vg1x4.nxv8i16(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_multi_za32_vg1x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16x4_t zm) { + SVE_ACLE_FUNC(svdot,,_za32,_u16,_vg1x4)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svdot_multi_za32_vg1x2_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.za32.vg1x2.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z30test_svdot_multi_za32_vg1x2_u8j11svuint8x2_t11svuint8x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.za32.vg1x2.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_multi_za32_vg1x2_u8(uint32_t slice_base, svuint8x2_t zn, svuint8x2_t zm) { + SVE_ACLE_FUNC(svdot,,_za32,_u8,_vg1x2)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svdot_multi_za32_vg1x4_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 32) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 48) +// CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.za32.vg1x4.nxv16i8(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret 
void +// +// CPP-CHECK-LABEL: @_Z30test_svdot_multi_za32_vg1x4_u8j11svuint8x4_t11svuint8x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 32) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 48) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.za32.vg1x4.nxv16i8(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_multi_za32_vg1x4_u8(uint32_t slice_base, svuint8x4_t zn, svuint8x4_t zm) { + SVE_ACLE_FUNC(svdot,,_za32,_u8,_vg1x4)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svdot_multi_za64_vg1x2_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.za64.vg1x2.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z31test_svdot_multi_za64_vg1x2_u16j12svuint16x2_t12svuint16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.za64.vg1x2.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_multi_za64_vg1x2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16x2_t zm) { + SVE_ACLE_FUNC(svdot,,_za64,_u16,_vg1x2)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svdot_multi_za64_vg1x4_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call 
@llvm.vector.extract.nxv8i16.nxv32i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 24) +// CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.za64.vg1x4.nxv8i16(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z31test_svdot_multi_za64_vg1x4_u16j12svuint16x4_t12svuint16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 24) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.za64.vg1x4.nxv8i16(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_multi_za64_vg1x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16x4_t zm) { + SVE_ACLE_FUNC(svdot,,_za64,_u16,_vg1x4)(slice_base, 7, zn, zm); +} + + +// +// Multi, multi (signed) +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svdot_multi_za32_vg1x2_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.za32.vg1x2.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z31test_svdot_multi_za32_vg1x2_s16j11svint16x2_t11svint16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.za32.vg1x2.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_multi_za32_vg1x2_s16(uint32_t slice_base, svint16x2_t zn, svint16x2_t zm) 
{ + SVE_ACLE_FUNC(svdot,,_za32,_s16,_vg1x2)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svdot_multi_za32_vg1x4_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 24) +// CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.za32.vg1x4.nxv8i16(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z31test_svdot_multi_za32_vg1x4_s16j11svint16x4_t11svint16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 24) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.za32.vg1x4.nxv8i16(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_multi_za32_vg1x4_s16(uint32_t slice_base, svint16x4_t zn, svint16x4_t zm) { + SVE_ACLE_FUNC(svdot,,_za32,_s16,_vg1x4)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svdot_multi_za32_vg1x2_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.za32.vg1x2.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z30test_svdot_multi_za32_vg1x2_s8j10svint8x2_t10svint8x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call 
@llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.za32.vg1x2.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_multi_za32_vg1x2_s8(uint32_t slice_base, svint8x2_t zn, svint8x2_t zm) { + SVE_ACLE_FUNC(svdot,,_za32,_s8,_vg1x2)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svdot_multi_za32_vg1x4_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 32) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 48) +// CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.za32.vg1x4.nxv16i8(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z30test_svdot_multi_za32_vg1x4_s8j10svint8x4_t10svint8x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 32) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 48) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.za32.vg1x4.nxv16i8(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_multi_za32_vg1x4_s8(uint32_t slice_base, svint8x4_t zn, svint8x4_t zm) { + SVE_ACLE_FUNC(svdot,,_za32,_s8,_vg1x4)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svdot_multi_za64_vg1x2_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: 
[[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.za64.vg1x2.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z31test_svdot_multi_za64_vg1x2_s16j11svint16x2_t11svint16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.za64.vg1x2.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_multi_za64_vg1x2_s16(uint32_t slice_base, svint16x2_t zn, svint16x2_t zm) { + SVE_ACLE_FUNC(svdot,,_za64,_s16,_vg1x2)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svdot_multi_za64_vg1x4_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 24) +// CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.za64.vg1x4.nxv8i16(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z31test_svdot_multi_za64_vg1x4_s16j11svint16x4_t11svint16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 24) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.za64.vg1x4.nxv8i16(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void 
test_svdot_multi_za64_vg1x4_s16(uint32_t slice_base, svint16x4_t zn, svint16x4_t zm) { + SVE_ACLE_FUNC(svdot,,_za64,_s16,_vg1x4)(slice_base, 7, zn, zm); +} + + +// +// Multi, single (unsigned) +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svdot_single_za32_vg1x2_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.single.za32.vg1x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z32test_svdot_single_za32_vg1x2_u16j12svuint16x2_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.single.za32.vg1x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_single_za32_vg1x2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) { + SVE_ACLE_FUNC(svdot,_single,_za32,_u16,_vg1x2)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svdot_single_za32_vg1x4_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.single.za32.vg1x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z32test_svdot_single_za32_vg1x4_u16j12svuint16x4_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.single.za32.vg1x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_single_za32_vg1x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) { + SVE_ACLE_FUNC(svdot,_single,_za32,_u16,_vg1x4)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svdot_single_za32_vg1x2_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void 
@llvm.aarch64.sme.udot.single.za32.vg1x2.nxv16i8(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z31test_svdot_single_za32_vg1x2_u8j11svuint8x2_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.single.za32.vg1x2.nxv16i8(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_single_za32_vg1x2_u8(uint32_t slice_base, svuint8x2_t zn, svuint8_t zm) { + SVE_ACLE_FUNC(svdot,_single,_za32,_u8,_vg1x2)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svdot_single_za32_vg1x4_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.single.za32.vg1x4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z31test_svdot_single_za32_vg1x4_u8j11svuint8x4_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.single.za32.vg1x4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_single_za32_vg1x4_u8(uint32_t slice_base, svuint8x4_t zn, svuint8_t zm) { + SVE_ACLE_FUNC(svdot,_single,_za32,_u8,_vg1x4)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svdot_single_za64_vg1x2_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.single.za64.vg1x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z32test_svdot_single_za64_vg1x2_u16j12svuint16x2_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.single.za64.vg1x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// 
CPP-CHECK-NEXT: ret void +// +void test_svdot_single_za64_vg1x2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) { + SVE_ACLE_FUNC(svdot,_single,_za64,_u16,_vg1x2)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svdot_single_za64_vg1x4_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.single.za64.vg1x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z32test_svdot_single_za64_vg1x4_u16j12svuint16x4_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.single.za64.vg1x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_single_za64_vg1x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) { + SVE_ACLE_FUNC(svdot,_single,_za64,_u16,_vg1x4)(slice_base, 7, zn, zm); +} + + +// +// Multi, single (signed) +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svdot_single_za32_vg1x2_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.single.za32.vg1x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z32test_svdot_single_za32_vg1x2_s16j11svint16x2_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.single.za32.vg1x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_single_za32_vg1x2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) { + SVE_ACLE_FUNC(svdot,_single,_za32,_s16,_vg1x2)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svdot_single_za32_vg1x4_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call 
@llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.single.za32.vg1x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z32test_svdot_single_za32_vg1x4_s16j11svint16x4_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.single.za32.vg1x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_single_za32_vg1x4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) { + SVE_ACLE_FUNC(svdot,_single,_za32,_s16,_vg1x4)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svdot_single_za32_vg1x2_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.single.za32.vg1x2.nxv16i8(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z31test_svdot_single_za32_vg1x2_s8j10svint8x2_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.single.za32.vg1x2.nxv16i8(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_single_za32_vg1x2_s8(uint32_t slice_base, svint8x2_t zn, svint8_t zm) { + SVE_ACLE_FUNC(svdot,_single,_za32,_s8,_vg1x2)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svdot_single_za32_vg1x4_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.single.za32.vg1x4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z31test_svdot_single_za32_vg1x4_s8j10svint8x4_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail 
call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.single.za32.vg1x4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_single_za32_vg1x4_s8(uint32_t slice_base, svint8x4_t zn, svint8_t zm) { + SVE_ACLE_FUNC(svdot,_single,_za32,_s8,_vg1x4)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svdot_single_za64_vg1x2_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.single.za64.vg1x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z32test_svdot_single_za64_vg1x2_s16j11svint16x2_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.single.za64.vg1x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_single_za64_vg1x2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) { + SVE_ACLE_FUNC(svdot,_single,_za64,_s16,_vg1x2)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svdot_single_za64_vg1x4_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.single.za64.vg1x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z32test_svdot_single_za64_vg1x4_s16j11svint16x4_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.single.za64.vg1x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_single_za64_vg1x4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) { + 
SVE_ACLE_FUNC(svdot,_single,_za64,_s16,_vg1x4)(slice_base, 7, zn, zm); +} + + + + + + +// +// Multi, indexed (unsigned) +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svdot_lane_za32_vg1x2_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z30test_svdot_lane_za32_vg1x2_u16j12svuint16x2_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_lane_za32_vg1x2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) { + SVE_ACLE_FUNC(svdot_lane,,_za32,_u16,_vg1x2)(slice_base, 7, zn, zm, 3); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svdot_lane_za32_vg1x4_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z30test_svdot_lane_za32_vg1x4_u16j12svuint16x4_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_lane_za32_vg1x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) { + SVE_ACLE_FUNC(svdot_lane,,_za32,_u16,_vg1x4)(slice_base, 7, zn, zm, 3); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svdot_lane_za32_vg1x2_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv16i8(i32 [[TMP2]], [[TMP0]], [[TMP1]], 
[[ZM:%.*]], i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svdot_lane_za32_vg1x2_u8j11svuint8x2_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv16i8(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_lane_za32_vg1x2_u8(uint32_t slice_base, svuint8x2_t zn, svuint8_t zm) { + SVE_ACLE_FUNC(svdot_lane,,_za32,_u8,_vg1x2)(slice_base, 7, zn, zm, 3); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svdot_lane_za32_vg1x4_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svdot_lane_za32_vg1x4_u8j11svuint8x4_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_lane_za32_vg1x4_u8(uint32_t slice_base, svuint8x4_t zn, svuint8_t zm) { + SVE_ACLE_FUNC(svdot_lane,,_za32,_u8,_vg1x4)(slice_base, 7, zn, zm, 3); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svdot_lane_za64_vg1x2_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.lane.za64.vg1x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 1) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z30test_svdot_lane_za64_vg1x2_u16j12svuint16x2_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.lane.za64.vg1x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 1) +// CPP-CHECK-NEXT: ret void +// +void 
test_svdot_lane_za64_vg1x2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) { + SVE_ACLE_FUNC(svdot_lane,,_za64,_u16,_vg1x2)(slice_base, 7, zn, zm, 1); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svdot_lane_za64_vg1x4_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.lane.za64.vg1x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 1) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z30test_svdot_lane_za64_vg1x4_u16j12svuint16x4_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.lane.za64.vg1x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 1) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_lane_za64_vg1x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) { + SVE_ACLE_FUNC(svdot_lane,,_za64,_u16,_vg1x4)(slice_base, 7, zn, zm, 1); +} + + +// +// Multi, indexed (signed) +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svdot_lane_za32_vg1x2_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z30test_svdot_lane_za32_vg1x2_s16j11svint16x2_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_lane_za32_vg1x2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) { + SVE_ACLE_FUNC(svdot_lane,,_za32,_s16,_vg1x2)(slice_base, 7, zn, zm, 3); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svdot_lane_za32_vg1x4_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) 
+// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z30test_svdot_lane_za32_vg1x4_s16j11svint16x4_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_lane_za32_vg1x4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) { + SVE_ACLE_FUNC(svdot_lane,,_za32,_s16,_vg1x4)(slice_base, 7, zn, zm, 3); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svdot_lane_za32_vg1x2_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x2.nxv16i8(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svdot_lane_za32_vg1x2_s8j10svint8x2_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x2.nxv16i8(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_lane_za32_vg1x2_s8(uint32_t slice_base, svint8x2_t zn, svint8_t zm) { + SVE_ACLE_FUNC(svdot_lane,,_za32,_s8,_vg1x2)(slice_base, 7, zn, zm, 3); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svdot_lane_za32_vg1x4_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svdot_lane_za32_vg1x4_s8j10svint8x4_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( 
[[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_lane_za32_vg1x4_s8(uint32_t slice_base, svint8x4_t zn, svint8_t zm) { + SVE_ACLE_FUNC(svdot_lane,,_za32,_s8,_vg1x4)(slice_base, 7, zn, zm, 3); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svdot_lane_za64_vg1x2_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.lane.za64.vg1x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 1) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z30test_svdot_lane_za64_vg1x2_s16j11svint16x2_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.lane.za64.vg1x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 1) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_lane_za64_vg1x2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) { + SVE_ACLE_FUNC(svdot_lane,,_za64,_s16,_vg1x2)(slice_base, 7, zn, zm, 1); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svdot_lane_za64_vg1x4_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.lane.za64.vg1x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 1) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z30test_svdot_lane_za64_vg1x4_s16j11svint16x4_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.lane.za64.vg1x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 1) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_lane_za64_vg1x4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) { + SVE_ACLE_FUNC(svdot_lane,,_za64,_s16,_vg1x4)(slice_base, 7, zn, 
zm, 1); +} + + +// +// Multi, multi (unsigned by signed) +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svusdot_multi_za32_vg1x2_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.usdot.za32.vg1x2.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z32test_svusdot_multi_za32_vg1x2_u8j11svuint8x2_t10svint8x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usdot.za32.vg1x2.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svusdot_multi_za32_vg1x2_u8(uint32_t slice_base, svuint8x2_t zn, svint8x2_t zm) { + SVE_ACLE_FUNC(svusdot,,_za32,_u8,_vg1x2)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svusdot_multi_za32_vg1x4_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 32) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 48) +// CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.usdot.za32.vg1x4.nxv16i8(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z32test_svusdot_multi_za32_vg1x4_u8j11svuint8x4_t10svint8x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 
16) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 32) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 48) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usdot.za32.vg1x4.nxv16i8(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_svusdot_multi_za32_vg1x4_u8(uint32_t slice_base, svuint8x4_t zn, svint8x4_t zm) { + SVE_ACLE_FUNC(svusdot,,_za32,_u8,_vg1x4)(slice_base, 7, zn, zm); +} + + +// +// Multi, single (unsigned by signed) +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svusdot_single_za32_vg1x2_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.usdot.single.za32.vg1x2.nxv16i8(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z33test_svusdot_single_za32_vg1x2_u8j11svuint8x2_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usdot.single.za32.vg1x2.nxv16i8(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svusdot_single_za32_vg1x2_u8(uint32_t slice_base, svuint8x2_t zn, svint8_t zm) { + SVE_ACLE_FUNC(svusdot,_single,_za32,_u8,_vg1x2)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svusdot_single_za32_vg1x4_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.usdot.single.za32.vg1x4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z33test_svusdot_single_za32_vg1x4_u8j11svuint8x4_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usdot.single.za32.vg1x4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svusdot_single_za32_vg1x4_u8(uint32_t slice_base, svuint8x4_t zn, svint8_t zm) { + 
SVE_ACLE_FUNC(svusdot,_single,_za32,_u8,_vg1x4)(slice_base, 7, zn, zm); +} + + +// +// Multi, indexed (unsigned by signed) +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svusdot_lane_za32_vg1x2_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x2.nxv16i8(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z31test_svusdot_lane_za32_vg1x2_u8j11svuint8x2_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x2.nxv16i8(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svusdot_lane_za32_vg1x2_u8(uint32_t slice_base, svuint8x2_t zn, svint8_t zm) { + SVE_ACLE_FUNC(svusdot_lane,,_za32,_u8,_vg1x2)(slice_base, 7, zn, zm, 3); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svusdot_lane_za32_vg1x4_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z31test_svusdot_lane_za32_vg1x4_u8j11svuint8x4_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svusdot_lane_za32_vg1x4_u8(uint32_t slice_base, svuint8x4_t zn, svint8_t zm) { + SVE_ACLE_FUNC(svusdot_lane,,_za32,_u8,_vg1x4)(slice_base, 7, zn, zm, 3); +} + + +// +// Multi, single (signed by unsigned) +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svsudot_single_za32_vg1x2_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void 
@llvm.aarch64.sme.sudot.single.za32.vg1x2.nxv16i8(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z33test_svsudot_single_za32_vg1x2_s8j10svint8x2_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sudot.single.za32.vg1x2.nxv16i8(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svsudot_single_za32_vg1x2_s8(uint32_t slice_base, svint8x2_t zn, svuint8_t zm) { + SVE_ACLE_FUNC(svsudot,_single,_za32,_s8,_vg1x2)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svsudot_single_za32_vg1x4_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sudot.single.za32.vg1x4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z33test_svsudot_single_za32_vg1x4_s8j10svint8x4_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sudot.single.za32.vg1x4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svsudot_single_za32_vg1x4_s8(uint32_t slice_base, svint8x4_t zn, svuint8_t zm) { + SVE_ACLE_FUNC(svsudot,_single,_za32,_s8,_vg1x4)(slice_base, 7, zn, zm); +} + + +// +// Multi, indexed (signed by unsigned) +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svsudot_lane_za32_vg1x2_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x2.nxv16i8(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z31test_svsudot_lane_za32_vg1x2_s8j10svint8x2_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void 
@llvm.aarch64.sme.sudot.lane.za32.vg1x2.nxv16i8(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svsudot_lane_za32_vg1x2_s8(uint32_t slice_base, svint8x2_t zn, svuint8_t zm) { + SVE_ACLE_FUNC(svsudot_lane,,_za32,_s8,_vg1x2)(slice_base, 7, zn, zm, 3); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svsudot_lane_za32_vg1x4_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z31test_svsudot_lane_za32_vg1x4_s8j10svint8x4_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svsudot_lane_za32_vg1x4_s8(uint32_t slice_base, svint8x4_t zn, svuint8_t zm) { + SVE_ACLE_FUNC(svsudot_lane,,_za32,_s8,_vg1x4)(slice_base, 7, zn, zm, 3); +} diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_ldr_str_zt.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_ldr_str_zt.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_ldr_str_zt.c @@ -0,0 +1,52 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py + +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. 
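+// For illustration only (not part of the autogenerated checks): with
+// SVE_OVERLOADED_FORMS defined, a hypothetical call such as SVE_ACLE_FUNC(svldr,_zt)
+// resolves to the overloaded name svldr; without it, the two tokens are pasted
+// together to give the fully suffixed name svldr_zt.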
+#define SVE_ACLE_FUNC(A1,A2_UNUSED) A1 +#else +#define SVE_ACLE_FUNC(A1,A2) A1##A2 +#endif + +// LDR ZT0 + +__attribute__((arm_streaming_compatible, arm_shared_za, arm_preserves_za)) +// CHECK-LABEL: @test_svldr_zt( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.ldr.zt(i32 0, ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z13test_svldr_ztPKv( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ldr.zt(i32 0, ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svldr_zt(const void *base) { + svldr_zt(0, base); +} + +// STR ZT0 + +__attribute__((arm_streaming_compatible, arm_shared_za, arm_preserves_za)) +// CHECK-LABEL: @test_svstr_zt( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.str.zt(i32 0, ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z13test_svstr_ztPv( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.str.zt(i32 0, ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svstr_zt(void *base) { + svstr_zt(0, base); +} diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt.c @@ -0,0 +1,58 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py + +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + +#include + +__attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) + +// CHECK-LABEL: @test_svluti2_lane_zt_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.luti2.lane.zt.nxv16i8(i32 0, [[ZN:%.*]], i32 2) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z23test_svluti2_lane_zt_u8u11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.luti2.lane.zt.nxv16i8(i32 0, [[ZN:%.*]], i32 2) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svuint8_t test_svluti2_lane_zt_u8(svuint8_t zn) { + return svluti2_lane_zt_u8(0, zn, 2); +} + +__attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) + +// CHECK-LABEL: @test_svluti2_lane_zt_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.luti2.lane.zt.nxv8i16(i32 0, [[ZN:%.*]], i32 2) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z24test_svluti2_lane_zt_u16u12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.luti2.lane.zt.nxv8i16(i32 0, [[ZN:%.*]], i32 2) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svuint16_t test_svluti2_lane_zt_u16(svuint16_t zn) { + return svluti2_lane_zt_u16(0, zn, 2); +} + +__attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) + + +// CHECK-LABEL: @test_svluti2_lane_zt_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.luti2.lane.zt.nxv4i32(i32 0, [[ZN:%.*]], i32 2) +// CHECK-NEXT: ret [[TMP0]] 
+// +// CPP-CHECK-LABEL: @_Z24test_svluti2_lane_zt_u32u12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.luti2.lane.zt.nxv4i32(i32 0, [[ZN:%.*]], i32 2) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svuint32_t test_svluti2_lane_zt_u32(svuint32_t zn) { + return svluti2_lane_zt_u32(0, zn, 2); +} diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt_x2.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt_x2.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt_x2.c @@ -0,0 +1,91 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py + +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. 
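+// For illustration only: in this file SVE_ACLE_FUNC(svluti2_lane_zt,_u8,_x2,)
+// expands to the overloaded name svluti2_lane_zt_x2 when SVE_OVERLOADED_FORMS is
+// defined, and to the fully suffixed name svluti2_lane_zt_u8_x2 otherwise.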
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 +#else +#define SVE_ACLE_FUNC(A1, A2, A3, A4) A1##A2##A3##A4 +#endif + +__attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) + +// CHECK-LABEL: @test_svluti2_lane_zt_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.luti2.lane.zt.x2.nxv16i8(i32 0, [[ZN:%.*]], i32 0) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z23test_svluti2_lane_zt_u8u11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.luti2.lane.zt.x2.nxv16i8(i32 0, [[ZN:%.*]], i32 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint8x2_t test_svluti2_lane_zt_u8(svuint8_t zn) { + return SVE_ACLE_FUNC(svluti2_lane_zt,_u8,_x2,)(0, zn, 0); +} + +__attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) + +// CHECK-LABEL: @test_svluti2_lane_zt_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.luti2.lane.zt.x2.nxv8i16(i32 0, [[ZN:%.*]], i32 7) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z24test_svluti2_lane_zt_u16u12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.luti2.lane.zt.x2.nxv8i16(i32 0, [[ZN:%.*]], i32 7) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint16x2_t test_svluti2_lane_zt_u16(svuint16_t zn) { + return SVE_ACLE_FUNC(svluti2_lane_zt,_u16,_x2,)(0, zn, 7); +} + +__attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) + + +// CHECK-LABEL: @test_svluti2_lane_zt_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.luti2.lane.zt.x2.nxv4i32(i32 0, [[ZN:%.*]], i32 7) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z24test_svluti2_lane_zt_u32u12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call 
{ , } @llvm.aarch64.sme.luti2.lane.zt.x2.nxv4i32(i32 0, [[ZN:%.*]], i32 7) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint32x2_t test_svluti2_lane_zt_u32(svuint32_t zn) { + return SVE_ACLE_FUNC(svluti2_lane_zt,_u32,_x2,)(0, zn, 7); +} diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt_x4.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt_x4.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt_x4.c @@ -0,0 +1,117 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py + +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. 
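+// For illustration only: SVE_ACLE_FUNC(svluti2_lane_zt,_u8,_x4,) expands to the
+// overloaded name svluti2_lane_zt_x4 under SVE_OVERLOADED_FORMS, and to the fully
+// suffixed name svluti2_lane_zt_u8_x4 otherwise.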
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 +#else +#define SVE_ACLE_FUNC(A1, A2, A3, A4) A1##A2##A3##A4 +#endif + +__attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) + + +// CHECK-LABEL: @test_svluti2_lane_zt_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.luti2.lane.zt.x4.nxv16i8(i32 0, [[ZN:%.*]], i32 0) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z23test_svluti2_lane_zt_u8u11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.luti2.lane.zt.x4.nxv16i8(i32 0, [[ZN:%.*]], i32 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint8x4_t test_svluti2_lane_zt_u8(svuint8_t zn) { + return SVE_ACLE_FUNC(svluti2_lane_zt,_u8,_x4,)(0, zn, 0); +} + +__attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) + +// CHECK-LABEL: @test_svluti2_lane_zt_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.luti2.lane.zt.x4.nxv8i16(i32 0, [[ZN:%.*]], i32 3) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z24test_svluti2_lane_zt_u16u12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.luti2.lane.zt.x4.nxv8i16(i32 0, [[ZN:%.*]], i32 3) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], 
i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint16x4_t test_svluti2_lane_zt_u16(svuint16_t zn) { + return SVE_ACLE_FUNC(svluti2_lane_zt,_u16,_x4,)(0, zn, 3); +} + + +__attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) + + +// CHECK-LABEL: @test_svluti2_lane_zt_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.luti2.lane.zt.x4.nxv4i32(i32 0, [[ZN:%.*]], i32 3) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z24test_svluti2_lane_zt_u32u12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.luti2.lane.zt.x4.nxv4i32(i32 0, [[ZN:%.*]], i32 3) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint32x4_t test_svluti2_lane_zt_u32(svuint32_t zn) { + return SVE_ACLE_FUNC(svluti2_lane_zt,_u32,_x4,)(0, zn, 3); +} diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt.c @@ -0,0 +1,57 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py + +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | 
FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + +#include + +__attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) + +// CHECK-LABEL: @test_svluti4_lane_zt_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.luti4.lane.zt.nxv16i8(i32 0, [[ZN:%.*]], i32 2) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z23test_svluti4_lane_zt_u8u11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.luti4.lane.zt.nxv16i8(i32 0, [[ZN:%.*]], i32 2) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svuint8_t test_svluti4_lane_zt_u8(svuint8_t zn) { + return svluti4_lane_zt_u8(0, zn, 2); +} + +__attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) + +// CHECK-LABEL: @test_svluti4_lane_zt_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.luti4.lane.zt.nxv8i16(i32 0, [[ZN:%.*]], i32 2) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z24test_svluti4_lane_zt_u16u12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.luti4.lane.zt.nxv8i16(i32 0, [[ZN:%.*]], i32 2) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svuint16_t test_svluti4_lane_zt_u16(svuint16_t zn) { + return svluti4_lane_zt_u16(0, zn, 2); +} + +__attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) + +// CHECK-LABEL: @test_svluti4_lane_zt_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.luti4.lane.zt.nxv4i32(i32 0, [[ZN:%.*]], i32 2) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z24test_svluti4_lane_zt_u32u12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.luti4.lane.zt.nxv4i32(i32 0, [[ZN:%.*]], i32 2) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svuint32_t test_svluti4_lane_zt_u32(svuint32_t zn) { + return svluti4_lane_zt_u32(0, zn, 2); +} diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt_x2.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt_x2.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt_x2.c @@ -0,0 +1,96 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2 + +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... 
macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 +#else +#define SVE_ACLE_FUNC(A1, A2, A3, A4) A1##A2##A3##A4 +#endif + +__attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) + +// CHECK-LABEL: define dso_local @test_svluti4_lane_zt_u8 +// CHECK-SAME: ( [[ZN:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.luti4.lane.zt.x2.nxv16i8(i32 0, [[ZN]], i32 0) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: define dso_local @_Z23test_svluti4_lane_zt_u8u11__SVUint8_t +// CPP-CHECK-SAME: ( [[ZN:%.*]]) #[[ATTR0:[0-9]+]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.luti4.lane.zt.x2.nxv16i8(i32 0, [[ZN]], i32 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint8x2_t test_svluti4_lane_zt_u8(svuint8_t zn) { + return SVE_ACLE_FUNC(svluti4_lane_zt,_u8,_x2,)(0, zn, 0); +} + +__attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) + +// CHECK-LABEL: define dso_local @test_svluti4_lane_zt_u16 +// CHECK-SAME: ( [[ZN:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.luti4.lane.zt.x2.nxv8i16(i32 0, [[ZN]], i32 3) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: define dso_local @_Z24test_svluti4_lane_zt_u16u12__SVUint16_t +// CPP-CHECK-SAME: ( [[ZN:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.luti4.lane.zt.x2.nxv8i16(i32 0, [[ZN]], i32 3) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint16x2_t test_svluti4_lane_zt_u16(svuint16_t zn) { + return SVE_ACLE_FUNC(svluti4_lane_zt,_u16,_x2,)(0, zn, 3); +} + +__attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) + +// CHECK-LABEL: define dso_local @test_svluti4_lane_zt_u32 +// CHECK-SAME: ( [[ZN:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.luti4.lane.zt.x2.nxv4i32(i32 0, [[ZN]], i32 3) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], 
i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: define dso_local @_Z24test_svluti4_lane_zt_u32u12__SVUint32_t +// CPP-CHECK-SAME: ( [[ZN:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.luti4.lane.zt.x2.nxv4i32(i32 0, [[ZN]], i32 3) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint32x2_t test_svluti4_lane_zt_u32(svuint32_t zn) { + return SVE_ACLE_FUNC(svluti4_lane_zt,_u32,_x2,)(0, zn, 3); +} diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt_x4.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt_x4.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt_x4.c @@ -0,0 +1,86 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2 + +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. 
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 +#else +#define SVE_ACLE_FUNC(A1, A2, A3, A4) A1##A2##A3##A4 +#endif + +__attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) + +// CHECK-LABEL: define dso_local @test_svluti4_lane_zt_u16 +// CHECK-SAME: ( [[ZN:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.luti4.lane.zt.x4.nxv8i16(i32 0, [[ZN]], i32 0) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: define dso_local @_Z24test_svluti4_lane_zt_u16u12__SVUint16_t +// CPP-CHECK-SAME: ( [[ZN:%.*]]) #[[ATTR0:[0-9]+]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.luti4.lane.zt.x4.nxv8i16(i32 0, [[ZN]], i32 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint16x4_t test_svluti4_lane_zt_u16(svuint16_t zn) { + return SVE_ACLE_FUNC(svluti4_lane_zt,_u16,_x4,)(0, zn, 0); +} + +__attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) + +// CHECK-LABEL: define dso_local @test_svluti4_lane_zt_u32 +// CHECK-SAME: ( [[ZN:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.luti4.lane.zt.x4.nxv4i32(i32 0, [[ZN]], i32 1) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: define dso_local @_Z24test_svluti4_lane_zt_u32u12__SVUint32_t +// CPP-CHECK-SAME: ( [[ZN:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail 
call { , , , } @llvm.aarch64.sme.luti4.lane.zt.x4.nxv4i32(i32 0, [[ZN]], i32 1) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint32x4_t test_svluti4_lane_zt_u32(svuint32_t zn) { + return SVE_ACLE_FUNC(svluti4_lane_zt,_u32,_x4,)(0, zn, 1); +} diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_max.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_max.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_max.c @@ -0,0 +1,1607 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. 
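+// For example, SVE_ACLE_FUNC(svmax,_single_s8_x2) below reduces to the
+// overloaded name svmax when SVE_OVERLOADED_FORMS is defined, and pastes
+// A1##A2 into the explicit name svmax_single_s8_x2 otherwise.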
+#define SVE_ACLE_FUNC(A1,A2_UNUSED) A1 +#else +#define SVE_ACLE_FUNC(A1,A2) A1##A2 +#endif + +// Single, x2 + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmax_single_s8_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.smax.single.x2.nxv16i8( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP4]], [[TMP5]], i64 16) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z23test_svmax_single_s8_x210svint8x2_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.smax.single.x2.nxv16i8( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP4]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svint8x2_t test_svmax_single_s8_x2(svint8x2_t zdn, svint8_t zm) { + return SVE_ACLE_FUNC(svmax,_single_s8_x2)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmax_single_s16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.smax.single.x2.nxv8i16( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z24test_svmax_single_s16_x211svint16x2_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.smax.single.x2.nxv8i16( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svint16x2_t test_svmax_single_s16_x2(svint16x2_t zdn, svint16_t zm) { + return 
SVE_ACLE_FUNC(svmax,_single_s16_x2)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmax_single_s32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.smax.single.x2.nxv4i32( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z24test_svmax_single_s32_x211svint32x2_tu11__SVInt32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.smax.single.x2.nxv4i32( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svint32x2_t test_svmax_single_s32_x2(svint32x2_t zdn, svint32_t zm) { + return SVE_ACLE_FUNC(svmax,_single_s32_x2)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmax_single_s64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.smax.single.x2.nxv2i64( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP4]], [[TMP5]], i64 2) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z24test_svmax_single_s64_x211svint64x2_tu11__SVInt64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.smax.single.x2.nxv2i64( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP4]], [[TMP5]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svint64x2_t test_svmax_single_s64_x2(svint64x2_t zdn, svint64_t zm) { + return SVE_ACLE_FUNC(svmax,_single_s64_x2)(zdn, zm); +} + +__attribute__((arm_streaming)) +// 
CHECK-LABEL: @test_svmax_single_u8_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.umax.single.x2.nxv16i8( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP4]], [[TMP5]], i64 16) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z23test_svmax_single_u8_x211svuint8x2_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.umax.single.x2.nxv16i8( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP4]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svuint8x2_t test_svmax_single_u8_x2(svuint8x2_t zdn, svuint8_t zm) { + return SVE_ACLE_FUNC(svmax,_single_u8_x2)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmax_single_u16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.umax.single.x2.nxv8i16( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z24test_svmax_single_u16_x212svuint16x2_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.umax.single.x2.nxv8i16( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svuint16x2_t test_svmax_single_u16_x2(svuint16x2_t zdn, svuint16_t zm) { + return SVE_ACLE_FUNC(svmax,_single_u16_x2)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmax_single_u32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: 
[[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.umax.single.x2.nxv4i32( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z24test_svmax_single_u32_x212svuint32x2_tu12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.umax.single.x2.nxv4i32( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svuint32x2_t test_svmax_single_u32_x2(svuint32x2_t zdn, svuint32_t zm) { + return SVE_ACLE_FUNC(svmax,_single_u32_x2)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmax_single_u64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.umax.single.x2.nxv2i64( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP4]], [[TMP5]], i64 2) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z24test_svmax_single_u64_x212svuint64x2_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.umax.single.x2.nxv2i64( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP4]], [[TMP5]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svuint64x2_t test_svmax_single_u64_x2(svuint64x2_t zdn, svuint64_t zm) { + return SVE_ACLE_FUNC(svmax,_single_u64_x2)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmax_single_f16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZDN:%.*]], i64 0) +// 
CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.fmax.single.x2.nxv8f16( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z24test_svmax_single_f16_x213svfloat16x2_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.fmax.single.x2.nxv8f16( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svfloat16x2_t test_svmax_single_f16_x2(svfloat16x2_t zdn, svfloat16_t zm) { + return SVE_ACLE_FUNC(svmax,_single_f16_x2)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmax_single_f32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.fmax.single.x2.nxv4f32( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z24test_svmax_single_f32_x213svfloat32x2_tu13__SVFloat32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.fmax.single.x2.nxv4f32( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svfloat32x2_t test_svmax_single_f32_x2(svfloat32x2_t zdn, svfloat32_t zm) { + return SVE_ACLE_FUNC(svmax,_single_f32_x2)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmax_single_f64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call 
@llvm.vector.extract.nxv2f64.nxv4f64( [[ZDN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.fmax.single.x2.nxv2f64( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP4]], [[TMP5]], i64 2) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z24test_svmax_single_f64_x213svfloat64x2_tu13__SVFloat64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZDN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.fmax.single.x2.nxv2f64( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP4]], [[TMP5]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svfloat64x2_t test_svmax_single_f64_x2(svfloat64x2_t zdn, svfloat64_t zm) { + return SVE_ACLE_FUNC(svmax,_single_f64_x2)(zdn, zm); +} + +// Single, x4 + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmax_single_s8_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.smax.single.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP8]], [[TMP9]], i64 32) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 48) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z23test_svmax_single_s8_x410svint8x4_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.smax.single.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: 
[[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP8]], [[TMP9]], i64 32) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svint8x4_t test_svmax_single_s8_x4(svint8x4_t zdn, svint8_t zm) { + return SVE_ACLE_FUNC(svmax,_single_s8_x4)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmax_single_s16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.smax.single.x4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 8) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP8]], [[TMP9]], i64 16) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 24) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z24test_svmax_single_s16_x411svint16x4_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.smax.single.x4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 8) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP8]], [[TMP9]], i64 16) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call 
@llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svint16x4_t test_svmax_single_s16_x4(svint16x4_t zdn, svint16_t zm) { + return SVE_ACLE_FUNC(svmax,_single_s16_x4)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmax_single_s32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.smax.single.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP8]], [[TMP9]], i64 8) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 12) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z24test_svmax_single_s32_x411svint32x4_tu11__SVInt32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.smax.single.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP8]], [[TMP9]], i64 8) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svint32x4_t test_svmax_single_s32_x4(svint32x4_t zdn, svint32_t zm) { + return SVE_ACLE_FUNC(svmax,_single_s32_x4)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmax_single_s64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 4) +// CHECK-NEXT: 
[[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.smax.single.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 2) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP8]], [[TMP9]], i64 4) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP10]], [[TMP11]], i64 6) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z24test_svmax_single_s64_x411svint64x4_tu11__SVInt64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.smax.single.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 2) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP8]], [[TMP9]], i64 4) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP10]], [[TMP11]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svint64x4_t test_svmax_single_s64_x4(svint64x4_t zdn, svint64_t zm) { + return SVE_ACLE_FUNC(svmax,_single_s64_x4)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmax_single_u8_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.umax.single.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] 
= tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP8]], [[TMP9]], i64 32) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 48) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z23test_svmax_single_u8_x411svuint8x4_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.umax.single.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP8]], [[TMP9]], i64 32) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svuint8x4_t test_svmax_single_u8_x4(svuint8x4_t zdn, svuint8_t zm) { + return SVE_ACLE_FUNC(svmax,_single_u8_x4)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmax_single_u16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.umax.single.x4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 8) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP8]], [[TMP9]], i64 16) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 24) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z24test_svmax_single_u16_x412svuint16x4_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call 
@llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.umax.single.x4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 8) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP8]], [[TMP9]], i64 16) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svuint16x4_t test_svmax_single_u16_x4(svuint16x4_t zdn, svuint16_t zm) { + return SVE_ACLE_FUNC(svmax,_single_u16_x4)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmax_single_u32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.umax.single.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP8]], [[TMP9]], i64 8) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 12) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z24test_svmax_single_u32_x412svuint32x4_tu12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.umax.single.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( 
[[TMP6]], [[TMP7]], i64 4) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP8]], [[TMP9]], i64 8) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svuint32x4_t test_svmax_single_u32_x4(svuint32x4_t zdn, svuint32_t zm) { + return SVE_ACLE_FUNC(svmax,_single_u32_x4)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmax_single_u64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.umax.single.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 2) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP8]], [[TMP9]], i64 4) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP10]], [[TMP11]], i64 6) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z24test_svmax_single_u64_x412svuint64x4_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.umax.single.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 2) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP8]], [[TMP9]], i64 4) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP10]], [[TMP11]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svuint64x4_t test_svmax_single_u64_x4(svuint64x4_t zdn, svuint64_t zm) { + return SVE_ACLE_FUNC(svmax,_single_u64_x4)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmax_single_f16_x4( +// CHECK-NEXT: 
entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmax.single.x4.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 8) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP8]], [[TMP9]], i64 16) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP10]], [[TMP11]], i64 24) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z24test_svmax_single_f16_x413svfloat16x4_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmax.single.x4.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 8) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP8]], [[TMP9]], i64 16) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP10]], [[TMP11]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svfloat16x4_t test_svmax_single_f16_x4(svfloat16x4_t zdn, svfloat16_t zm) { + return SVE_ACLE_FUNC(svmax,_single_f16_x4)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmax_single_f32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmax.single.x4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: 
[[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 4) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP8]], [[TMP9]], i64 8) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP10]], [[TMP11]], i64 12) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z24test_svmax_single_f32_x413svfloat32x4_tu13__SVFloat32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmax.single.x4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 4) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP8]], [[TMP9]], i64 8) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP10]], [[TMP11]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svfloat32x4_t test_svmax_single_f32_x4(svfloat32x4_t zdn, svfloat32_t zm) { + return SVE_ACLE_FUNC(svmax,_single_f32_x4)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmax_single_f64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmax.single.x4.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 2) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP8]], [[TMP9]], i64 4) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP10]], [[TMP11]], i64 6) +// CHECK-NEXT: ret [[TMP12]] 
+// +// CPP-CHECK-LABEL: @_Z24test_svmax_single_f64_x413svfloat64x4_tu13__SVFloat64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmax.single.x4.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 2) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP8]], [[TMP9]], i64 4) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP10]], [[TMP11]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svfloat64x4_t test_svmax_single_f64_x4(svfloat64x4_t zdn, svfloat64_t zm) { + return SVE_ACLE_FUNC(svmax,_single_f64_x4)(zdn, zm); +} + +// Multi, x2 + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmax_s8_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.smax.x2.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z16test_svmax_s8_x210svint8x2_t10svint8x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.smax.x2.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP8]] +// 
+svint8x2_t test_svmax_s8_x2(svint8x2_t zdn, svint8x2_t zm) { + return SVE_ACLE_FUNC(svmax,_s8_x2)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmax_s16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.smax.x2.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP6]], [[TMP7]], i64 8) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svmax_s16_x211svint16x2_t11svint16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.smax.x2.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP6]], [[TMP7]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint16x2_t test_svmax_s16_x2(svint16x2_t zdn, svint16x2_t zm) { + return SVE_ACLE_FUNC(svmax,_s16_x2)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmax_s32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.smax.x2.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svmax_s32_x211svint32x2_t11svint32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call 
@llvm.vector.extract.nxv4i32.nxv8i32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.smax.x2.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint32x2_t test_svmax_s32_x2(svint32x2_t zdn, svint32x2_t zm) { + return SVE_ACLE_FUNC(svmax,_s32_x2)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmax_s64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM]], i64 2) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.smax.x2.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP6]], [[TMP7]], i64 2) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svmax_s64_x211svint64x2_t11svint64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM]], i64 2) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.smax.x2.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP6]], [[TMP7]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint64x2_t test_svmax_s64_x2(svint64x2_t zdn, svint64x2_t zm) { + return SVE_ACLE_FUNC(svmax,_s64_x2)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmax_u8_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.umax.x2.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// 
CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z16test_svmax_u8_x211svuint8x2_t11svuint8x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.umax.x2.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint8x2_t test_svmax_u8_x2(svuint8x2_t zdn, svuint8x2_t zm) { + return SVE_ACLE_FUNC(svmax,_u8_x2)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmax_u16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.umax.x2.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP6]], [[TMP7]], i64 8) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svmax_u16_x212svuint16x2_t12svuint16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.umax.x2.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP6]], [[TMP7]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint16x2_t test_svmax_u16_x2(svuint16x2_t zdn, svuint16x2_t zm) { + return SVE_ACLE_FUNC(svmax,_u16_x2)(zdn, zm); +} + 
+__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmax_u32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.umax.x2.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svmax_u32_x212svuint32x2_t12svuint32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.umax.x2.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint32x2_t test_svmax_u32_x2(svuint32x2_t zdn, svuint32x2_t zm) { + return SVE_ACLE_FUNC(svmax,_u32_x2)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmax_u64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM]], i64 2) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.umax.x2.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP6]], [[TMP7]], i64 2) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svmax_u64_x212svuint64x2_t12svuint64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( 
[[ZM]], i64 2) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.umax.x2.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP6]], [[TMP7]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint64x2_t test_svmax_u64_x2(svuint64x2_t zdn, svuint64x2_t zm) { + return SVE_ACLE_FUNC(svmax,_u64_x2)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmax_f16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.fmax.x2.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP6]], [[TMP7]], i64 8) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svmax_f16_x213svfloat16x2_t13svfloat16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.fmax.x2.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP6]], [[TMP7]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat16x2_t test_svmax_f16_x2(svfloat16x2_t zdn, svfloat16x2_t zm) { + return SVE_ACLE_FUNC(svmax,_f16_x2)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmax_f32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.fmax.x2.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP5]], i64 0) +// CHECK-NEXT: 
[[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP6]], [[TMP7]], i64 4) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svmax_f32_x213svfloat32x2_t13svfloat32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.fmax.x2.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP6]], [[TMP7]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat32x2_t test_svmax_f32_x2(svfloat32x2_t zdn, svfloat32x2_t zm) { + return SVE_ACLE_FUNC(svmax,_f32_x2)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmax_f64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZDN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZM]], i64 2) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.fmax.x2.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP6]], [[TMP7]], i64 2) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svmax_f64_x213svfloat64x2_t13svfloat64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZDN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZM]], i64 2) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.fmax.x2.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP6]], [[TMP7]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat64x2_t test_svmax_f64_x2(svfloat64x2_t zdn, svfloat64x2_t zm) { + return SVE_ACLE_FUNC(svmax,_f64_x2)(zdn, zm); +} + +// Multi, x4 + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmax_s8_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: 
[[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 32) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 48) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.smax.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 16) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP12]], [[TMP13]], i64 32) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP14]], [[TMP15]], i64 48) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z16test_svmax_s8_x410svint8x4_t10svint8x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 32) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 48) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.smax.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 16) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP12]], [[TMP13]], i64 32) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP14]], [[TMP15]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svint8x4_t test_svmax_s8_x4(svint8x4_t zdn, svint8x4_t zm) { + 
return SVE_ACLE_FUNC(svmax,_s8_x4)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmax_s16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 24) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.smax.x4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 8) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP12]], [[TMP13]], i64 16) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP14]], [[TMP15]], i64 24) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z17test_svmax_s16_x411svint16x4_t11svint16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 24) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.smax.x4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 8) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP12]], [[TMP13]], i64 16) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail 
call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP14]], [[TMP15]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svint16x4_t test_svmax_s16_x4(svint16x4_t zdn, svint16x4_t zm) { + return SVE_ACLE_FUNC(svmax,_s16_x4)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmax_s32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 12) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.smax.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 4) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP12]], [[TMP13]], i64 8) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP14]], [[TMP15]], i64 12) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z17test_svmax_s32_x411svint32x4_t11svint32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 12) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.smax.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 4) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call 
@llvm.vector.insert.nxv16i32.nxv4i32( [[TMP12]], [[TMP13]], i64 8) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP14]], [[TMP15]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svint32x4_t test_svmax_s32_x4(svint32x4_t zdn, svint32x4_t zm) { + return SVE_ACLE_FUNC(svmax,_s32_x4)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmax_s64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 2) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 6) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.smax.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP10]], [[TMP11]], i64 2) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP12]], [[TMP13]], i64 4) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP14]], [[TMP15]], i64 6) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z17test_svmax_s64_x411svint64x4_t11svint64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 2) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 6) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.smax.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call 
@llvm.vector.insert.nxv8i64.nxv2i64( [[TMP10]], [[TMP11]], i64 2) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP12]], [[TMP13]], i64 4) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP14]], [[TMP15]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svint64x4_t test_svmax_s64_x4(svint64x4_t zdn, svint64x4_t zm) { + return SVE_ACLE_FUNC(svmax,_s64_x4)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmax_u8_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 32) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 48) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.umax.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 16) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP12]], [[TMP13]], i64 32) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP14]], [[TMP15]], i64 48) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z16test_svmax_u8_x411svuint8x4_t11svuint8x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 32) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 48) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.umax.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call 
@llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 16) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP12]], [[TMP13]], i64 32) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP14]], [[TMP15]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svuint8x4_t test_svmax_u8_x4(svuint8x4_t zdn, svuint8x4_t zm) { + return SVE_ACLE_FUNC(svmax,_u8_x4)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmax_u16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 24) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.umax.x4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 8) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP12]], [[TMP13]], i64 16) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP14]], [[TMP15]], i64 24) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z17test_svmax_u16_x412svuint16x4_t12svuint16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 24) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.umax.x4.nxv8i16( [[TMP0]], 
[[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 8) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP12]], [[TMP13]], i64 16) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP14]], [[TMP15]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svuint16x4_t test_svmax_u16_x4(svuint16x4_t zdn, svuint16x4_t zm) { + return SVE_ACLE_FUNC(svmax,_u16_x4)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmax_u32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 12) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.umax.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 4) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP12]], [[TMP13]], i64 8) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP14]], [[TMP15]], i64 12) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z17test_svmax_u32_x412svuint32x4_t12svuint32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 8) +// CPP-CHECK-NEXT: 
[[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 12) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.umax.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 4) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP12]], [[TMP13]], i64 8) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP14]], [[TMP15]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svuint32x4_t test_svmax_u32_x4(svuint32x4_t zdn, svuint32x4_t zm) { + return SVE_ACLE_FUNC(svmax,_u32_x4)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmax_u64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 2) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 6) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.umax.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP10]], [[TMP11]], i64 2) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP12]], [[TMP13]], i64 4) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP14]], [[TMP15]], i64 6) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z17test_svmax_u64_x412svuint64x4_t12svuint64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call 
@llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 2) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 6) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.umax.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP10]], [[TMP11]], i64 2) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP12]], [[TMP13]], i64 4) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP14]], [[TMP15]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svuint64x4_t test_svmax_u64_x4(svuint64x4_t zdn, svuint64x4_t zm) { + return SVE_ACLE_FUNC(svmax,_u64_x4)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmax_f16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 24) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmax.x4.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP10]], [[TMP11]], i64 8) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP12]], [[TMP13]], i64 16) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP14]], [[TMP15]], i64 24) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z17test_svmax_f16_x413svfloat16x4_t13svfloat16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call 
@llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 24) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmax.x4.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP10]], [[TMP11]], i64 8) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP12]], [[TMP13]], i64 16) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP14]], [[TMP15]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svfloat16x4_t test_svmax_f16_x4(svfloat16x4_t zdn, svfloat16x4_t zm) { + return SVE_ACLE_FUNC(svmax,_f16_x4)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmax_f32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 12) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmax.x4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP10]], [[TMP11]], i64 4) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP12]], [[TMP13]], i64 8) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP14]], [[TMP15]], i64 12) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z17test_svmax_f32_x413svfloat32x4_t13svfloat32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call 
@llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 12) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmax.x4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP10]], [[TMP11]], i64 4) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP12]], [[TMP13]], i64 8) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP14]], [[TMP15]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svfloat32x4_t test_svmax_f32_x4(svfloat32x4_t zdn, svfloat32x4_t zm) { + return SVE_ACLE_FUNC(svmax,_f32_x4)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmax_f64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 2) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 6) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmax.x4.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP10]], [[TMP11]], i64 2) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP12]], [[TMP13]], i64 4) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP14]], [[TMP15]], i64 6) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z17test_svmax_f64_x413svfloat64x4_t13svfloat64x4_t( +// 
CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 2) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 6) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmax.x4.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP10]], [[TMP11]], i64 2) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP12]], [[TMP13]], i64 4) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP14]], [[TMP15]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svfloat64x4_t test_svmax_f64_x4(svfloat64x4_t zdn, svfloat64x4_t zm) { + return SVE_ACLE_FUNC(svmax,_f64_x4)(zdn, zm); +} diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_maxnm.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_maxnm.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_maxnm.c @@ -0,0 +1,456 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. 
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED,A5) A1##A3##A5 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4,A5) A1##A2##A3##A4##A5 +#endif + +// Single, x2 + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmaxnm_single_f16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.fmaxnm.single.x2.nxv8f16( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z26test_svmaxnm_single_f16_x213svfloat16x2_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.fmaxnm.single.x2.nxv8f16( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svfloat16x2_t test_svmaxnm_single_f16_x2(svfloat16x2_t zdn, svfloat16_t zm) { + return SVE_ACLE_FUNC(svmaxnm,_single_f16_x2,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmaxnm_single_f32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.fmaxnm.single.x2.nxv4f32( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z26test_svmaxnm_single_f32_x213svfloat32x2_tu13__SVFloat32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.fmaxnm.single.x2.nxv4f32( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svfloat32x2_t 
test_svmaxnm_single_f32_x2(svfloat32x2_t zdn, svfloat32_t zm) { + return SVE_ACLE_FUNC(svmaxnm,_single_f32_x2,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmaxnm_single_f64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZDN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.fmaxnm.single.x2.nxv2f64( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP4]], [[TMP5]], i64 2) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z26test_svmaxnm_single_f64_x213svfloat64x2_tu13__SVFloat64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZDN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.fmaxnm.single.x2.nxv2f64( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP4]], [[TMP5]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svfloat64x2_t test_svmaxnm_single_f64_x2(svfloat64x2_t zdn, svfloat64_t zm) { + return SVE_ACLE_FUNC(svmaxnm,_single_f64_x2,,,)(zdn, zm); +} + +// Single, x4 + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmaxnm_single_f16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmaxnm.single.x4.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 8) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP8]], [[TMP9]], i64 16) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP10]], [[TMP11]], i64 24) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z26test_svmaxnm_single_f16_x413svfloat16x4_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call 
@llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmaxnm.single.x4.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 8) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP8]], [[TMP9]], i64 16) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP10]], [[TMP11]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svfloat16x4_t test_svmaxnm_single_f16_x4(svfloat16x4_t zdn, svfloat16_t zm) { + return SVE_ACLE_FUNC(svmaxnm,_single_f16_x4,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmaxnm_single_f32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmaxnm.single.x4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 4) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP8]], [[TMP9]], i64 8) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP10]], [[TMP11]], i64 12) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z26test_svmaxnm_single_f32_x413svfloat32x4_tu13__SVFloat32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmaxnm.single.x4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: 
[[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 4) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP8]], [[TMP9]], i64 8) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP10]], [[TMP11]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svfloat32x4_t test_svmaxnm_single_f32_x4(svfloat32x4_t zdn, svfloat32_t zm) { + return SVE_ACLE_FUNC(svmaxnm,_single_f32_x4,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmaxnm_single_f64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmaxnm.single.x4.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 2) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP8]], [[TMP9]], i64 4) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP10]], [[TMP11]], i64 6) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z26test_svmaxnm_single_f64_x413svfloat64x4_tu13__SVFloat64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmaxnm.single.x4.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 2) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP8]], [[TMP9]], i64 4) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP10]], [[TMP11]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svfloat64x4_t test_svmaxnm_single_f64_x4(svfloat64x4_t zdn, svfloat64_t 
zm) { + return SVE_ACLE_FUNC(svmaxnm,_single_f64_x4,,,)(zdn, zm); +} + +// Multi, x2 + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmaxnm_multi_f16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.fmaxnm.x2.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP6]], [[TMP7]], i64 8) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z25test_svmaxnm_multi_f16_x213svfloat16x2_t13svfloat16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.fmaxnm.x2.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP6]], [[TMP7]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat16x2_t test_svmaxnm_multi_f16_x2(svfloat16x2_t zdn, svfloat16x2_t zm) { + return SVE_ACLE_FUNC(svmaxnm,_f16_x2,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmaxnm_multi_f32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.fmaxnm.x2.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP6]], [[TMP7]], i64 4) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z25test_svmaxnm_multi_f32_x213svfloat32x2_t13svfloat32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: 
[[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.fmaxnm.x2.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP6]], [[TMP7]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat32x2_t test_svmaxnm_multi_f32_x2(svfloat32x2_t zdn, svfloat32x2_t zm) { + return SVE_ACLE_FUNC(svmaxnm,_f32_x2,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmaxnm_multi_f64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZDN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZM]], i64 2) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.fmaxnm.x2.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP6]], [[TMP7]], i64 2) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z25test_svmaxnm_multi_f64_x213svfloat64x2_t13svfloat64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZDN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZM]], i64 2) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.fmaxnm.x2.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP6]], [[TMP7]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat64x2_t test_svmaxnm_multi_f64_x2(svfloat64x2_t zdn, svfloat64x2_t zm) { + return SVE_ACLE_FUNC(svmaxnm,_f64_x2,,,)(zdn, zm); +} + +// Multi, x4 + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmaxnm_multi_f16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( 
[[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 24) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmaxnm.x4.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP10]], [[TMP11]], i64 8) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP12]], [[TMP13]], i64 16) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP14]], [[TMP15]], i64 24) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z25test_svmaxnm_multi_f16_x413svfloat16x4_t13svfloat16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 24) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmaxnm.x4.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP10]], [[TMP11]], i64 8) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP12]], [[TMP13]], i64 16) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP14]], [[TMP15]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svfloat16x4_t test_svmaxnm_multi_f16_x4(svfloat16x4_t zdn, svfloat16x4_t zm) { + return SVE_ACLE_FUNC(svmaxnm,_f16_x4,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmaxnm_multi_f32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call 
@llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 12) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmaxnm.x4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP10]], [[TMP11]], i64 4) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP12]], [[TMP13]], i64 8) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP14]], [[TMP15]], i64 12) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z25test_svmaxnm_multi_f32_x413svfloat32x4_t13svfloat32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 12) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmaxnm.x4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP10]], [[TMP11]], i64 4) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP12]], [[TMP13]], i64 8) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP14]], [[TMP15]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svfloat32x4_t test_svmaxnm_multi_f32_x4(svfloat32x4_t zdn, svfloat32x4_t zm) { + return SVE_ACLE_FUNC(svmaxnm,_f32_x4,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmaxnm_multi_f64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: 
[[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 2) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 6) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmaxnm.x4.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP10]], [[TMP11]], i64 2) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP12]], [[TMP13]], i64 4) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP14]], [[TMP15]], i64 6) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z25test_svmaxnm_multi_f64_x413svfloat64x4_t13svfloat64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 2) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 6) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmaxnm.x4.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP10]], [[TMP11]], i64 2) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP12]], [[TMP13]], i64 4) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP14]], [[TMP15]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svfloat64x4_t test_svmaxnm_multi_f64_x4(svfloat64x4_t zdn, 
svfloat64x4_t zm) { + return SVE_ACLE_FUNC(svmaxnm,_f64_x4,,,)(zdn, zm); +} diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_min.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_min.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_min.c @@ -0,0 +1,1607 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1,A2_UNUSED) A1 +#else +#define SVE_ACLE_FUNC(A1,A2) A1##A2 +#endif + +// Single, x2 + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmin_single_s8_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.smin.single.x2.nxv16i8( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP4]], [[TMP5]], i64 16) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z23test_svmin_single_s8_x210svint8x2_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.smin.single.x2.nxv16i8( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP4]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svint8x2_t test_svmin_single_s8_x2(svint8x2_t zdn, svint8_t zm) { + return SVE_ACLE_FUNC(svmin,_single_s8_x2)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmin_single_s16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call 
@llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.smin.single.x2.nxv8i16( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z24test_svmin_single_s16_x211svint16x2_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.smin.single.x2.nxv8i16( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svint16x2_t test_svmin_single_s16_x2(svint16x2_t zdn, svint16_t zm) { + return SVE_ACLE_FUNC(svmin,_single_s16_x2)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmin_single_s32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.smin.single.x2.nxv4i32( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z24test_svmin_single_s32_x211svint32x2_tu11__SVInt32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.smin.single.x2.nxv4i32( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svint32x2_t test_svmin_single_s32_x2(svint32x2_t zdn, svint32_t zm) { + return SVE_ACLE_FUNC(svmin,_single_s32_x2)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmin_single_s64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = 
tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.smin.single.x2.nxv2i64( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP4]], [[TMP5]], i64 2) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z24test_svmin_single_s64_x211svint64x2_tu11__SVInt64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.smin.single.x2.nxv2i64( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP4]], [[TMP5]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svint64x2_t test_svmin_single_s64_x2(svint64x2_t zdn, svint64_t zm) { + return SVE_ACLE_FUNC(svmin,_single_s64_x2)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmin_single_u8_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.umin.single.x2.nxv16i8( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP4]], [[TMP5]], i64 16) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z23test_svmin_single_u8_x211svuint8x2_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.umin.single.x2.nxv16i8( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP4]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svuint8x2_t test_svmin_single_u8_x2(svuint8x2_t zdn, svuint8_t zm) { + return SVE_ACLE_FUNC(svmin,_single_u8_x2)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmin_single_u16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = 
tail call { , } @llvm.aarch64.sve.umin.single.x2.nxv8i16( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z24test_svmin_single_u16_x212svuint16x2_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.umin.single.x2.nxv8i16( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svuint16x2_t test_svmin_single_u16_x2(svuint16x2_t zdn, svuint16_t zm) { + return SVE_ACLE_FUNC(svmin,_single_u16_x2)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmin_single_u32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.umin.single.x2.nxv4i32( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z24test_svmin_single_u32_x212svuint32x2_tu12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.umin.single.x2.nxv4i32( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svuint32x2_t test_svmin_single_u32_x2(svuint32x2_t zdn, svuint32_t zm) { + return SVE_ACLE_FUNC(svmin,_single_u32_x2)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmin_single_u64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.umin.single.x2.nxv2i64( [[TMP0]], [[TMP1]], 
[[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP4]], [[TMP5]], i64 2) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z24test_svmin_single_u64_x212svuint64x2_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.umin.single.x2.nxv2i64( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP4]], [[TMP5]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svuint64x2_t test_svmin_single_u64_x2(svuint64x2_t zdn, svuint64_t zm) { + return SVE_ACLE_FUNC(svmin,_single_u64_x2)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmin_single_f16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.fmin.single.x2.nxv8f16( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z24test_svmin_single_f16_x213svfloat16x2_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.fmin.single.x2.nxv8f16( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svfloat16x2_t test_svmin_single_f16_x2(svfloat16x2_t zdn, svfloat16_t zm) { + return SVE_ACLE_FUNC(svmin,_single_f16_x2)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmin_single_f32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.fmin.single.x2.nxv4f32( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 
+// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z24test_svmin_single_f32_x213svfloat32x2_tu13__SVFloat32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.fmin.single.x2.nxv4f32( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svfloat32x2_t test_svmin_single_f32_x2(svfloat32x2_t zdn, svfloat32_t zm) { + return SVE_ACLE_FUNC(svmin,_single_f32_x2)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmin_single_f64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZDN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.fmin.single.x2.nxv2f64( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP4]], [[TMP5]], i64 2) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z24test_svmin_single_f64_x213svfloat64x2_tu13__SVFloat64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZDN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.fmin.single.x2.nxv2f64( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP4]], [[TMP5]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svfloat64x2_t test_svmin_single_f64_x2(svfloat64x2_t zdn, svfloat64_t zm) { + return SVE_ACLE_FUNC(svmin,_single_f64_x2)(zdn, zm); +} + +// Single, x4 + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmin_single_s8_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } 
@llvm.aarch64.sve.smin.single.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP8]], [[TMP9]], i64 32) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 48) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z23test_svmin_single_s8_x410svint8x4_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.smin.single.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP8]], [[TMP9]], i64 32) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svint8x4_t test_svmin_single_s8_x4(svint8x4_t zdn, svint8_t zm) { + return SVE_ACLE_FUNC(svmin,_single_s8_x4)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmin_single_s16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.smin.single.x4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 8) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP8]], [[TMP9]], i64 16) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue 
{ , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 24) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z24test_svmin_single_s16_x411svint16x4_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.smin.single.x4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 8) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP8]], [[TMP9]], i64 16) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svint16x4_t test_svmin_single_s16_x4(svint16x4_t zdn, svint16_t zm) { + return SVE_ACLE_FUNC(svmin,_single_s16_x4)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmin_single_s32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.smin.single.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP8]], [[TMP9]], i64 8) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 12) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z24test_svmin_single_s32_x411svint32x4_tu11__SVInt32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call 
@llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.smin.single.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP8]], [[TMP9]], i64 8) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svint32x4_t test_svmin_single_s32_x4(svint32x4_t zdn, svint32_t zm) { + return SVE_ACLE_FUNC(svmin,_single_s32_x4)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmin_single_s64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.smin.single.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 2) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP8]], [[TMP9]], i64 4) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP10]], [[TMP11]], i64 6) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z24test_svmin_single_s64_x411svint64x4_tu11__SVInt64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.smin.single.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 2) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: 
[[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP8]], [[TMP9]], i64 4) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP10]], [[TMP11]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svint64x4_t test_svmin_single_s64_x4(svint64x4_t zdn, svint64_t zm) { + return SVE_ACLE_FUNC(svmin,_single_s64_x4)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmin_single_u8_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.umin.single.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP8]], [[TMP9]], i64 32) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 48) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z23test_svmin_single_u8_x411svuint8x4_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.umin.single.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP8]], [[TMP9]], i64 32) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svuint8x4_t test_svmin_single_u8_x4(svuint8x4_t zdn, svuint8_t zm) { + return SVE_ACLE_FUNC(svmin,_single_u8_x4)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmin_single_u16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN:%.*]], i64 0) +// 
CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.umin.single.x4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 8) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP8]], [[TMP9]], i64 16) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 24) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z24test_svmin_single_u16_x412svuint16x4_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.umin.single.x4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 8) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP8]], [[TMP9]], i64 16) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svuint16x4_t test_svmin_single_u16_x4(svuint16x4_t zdn, svuint16_t zm) { + return SVE_ACLE_FUNC(svmin,_single_u16_x4)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmin_single_u32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.umin.single.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = 
extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP8]], [[TMP9]], i64 8) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 12) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z24test_svmin_single_u32_x412svuint32x4_tu12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.umin.single.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP8]], [[TMP9]], i64 8) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svuint32x4_t test_svmin_single_u32_x4(svuint32x4_t zdn, svuint32_t zm) { + return SVE_ACLE_FUNC(svmin,_single_u32_x4)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmin_single_u64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.umin.single.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 2) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP8]], [[TMP9]], i64 4) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP10]], [[TMP11]], i64 6) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z24test_svmin_single_u64_x412svuint64x4_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// 
CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.umin.single.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 2) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP8]], [[TMP9]], i64 4) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP10]], [[TMP11]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svuint64x4_t test_svmin_single_u64_x4(svuint64x4_t zdn, svuint64_t zm) { + return SVE_ACLE_FUNC(svmin,_single_u64_x4)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmin_single_f16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmin.single.x4.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 8) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP8]], [[TMP9]], i64 16) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP10]], [[TMP11]], i64 24) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z24test_svmin_single_f16_x413svfloat16x4_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmin.single.x4.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: 
[[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 8) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP8]], [[TMP9]], i64 16) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP10]], [[TMP11]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svfloat16x4_t test_svmin_single_f16_x4(svfloat16x4_t zdn, svfloat16_t zm) { + return SVE_ACLE_FUNC(svmin,_single_f16_x4)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmin_single_f32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmin.single.x4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 4) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP8]], [[TMP9]], i64 8) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP10]], [[TMP11]], i64 12) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z24test_svmin_single_f32_x413svfloat32x4_tu13__SVFloat32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmin.single.x4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 4) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP8]], [[TMP9]], i64 8) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP10]], [[TMP11]], i64 
12) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svfloat32x4_t test_svmin_single_f32_x4(svfloat32x4_t zdn, svfloat32_t zm) { + return SVE_ACLE_FUNC(svmin,_single_f32_x4)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmin_single_f64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmin.single.x4.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 2) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP8]], [[TMP9]], i64 4) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP10]], [[TMP11]], i64 6) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z24test_svmin_single_f64_x413svfloat64x4_tu13__SVFloat64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmin.single.x4.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 2) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP8]], [[TMP9]], i64 4) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP10]], [[TMP11]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svfloat64x4_t test_svmin_single_f64_x4(svfloat64x4_t zdn, svfloat64_t zm) { + return SVE_ACLE_FUNC(svmin,_single_f64_x4)(zdn, zm); +} + +// Multi, x2 + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmin_s8_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call 
@llvm.vector.extract.nxv16i8.nxv32i8( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.smin.x2.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z16test_svmin_s8_x210svint8x2_t10svint8x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.smin.x2.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint8x2_t test_svmin_s8_x2(svint8x2_t zdn, svint8x2_t zm) { + return SVE_ACLE_FUNC(svmin,_s8_x2)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmin_s16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.smin.x2.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP6]], [[TMP7]], i64 8) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svmin_s16_x211svint16x2_t11svint16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.smin.x2.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call 
@llvm.vector.insert.nxv16i16.nxv8i16( [[TMP6]], [[TMP7]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint16x2_t test_svmin_s16_x2(svint16x2_t zdn, svint16x2_t zm) { + return SVE_ACLE_FUNC(svmin,_s16_x2)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmin_s32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.smin.x2.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svmin_s32_x211svint32x2_t11svint32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.smin.x2.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint32x2_t test_svmin_s32_x2(svint32x2_t zdn, svint32x2_t zm) { + return SVE_ACLE_FUNC(svmin,_s32_x2)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmin_s64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM]], i64 2) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.smin.x2.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP6]], [[TMP7]], i64 2) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svmin_s64_x211svint64x2_t11svint64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call 
@llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM]], i64 2) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.smin.x2.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP6]], [[TMP7]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint64x2_t test_svmin_s64_x2(svint64x2_t zdn, svint64x2_t zm) { + return SVE_ACLE_FUNC(svmin,_s64_x2)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmin_u8_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.umin.x2.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z16test_svmin_u8_x211svuint8x2_t11svuint8x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.umin.x2.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint8x2_t test_svmin_u8_x2(svuint8x2_t zdn, svuint8x2_t zm) { + return SVE_ACLE_FUNC(svmin,_u8_x2)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmin_u16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.umin.x2.nxv8i16( [[TMP0]], 
[[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP6]], [[TMP7]], i64 8) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svmin_u16_x212svuint16x2_t12svuint16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.umin.x2.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP6]], [[TMP7]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint16x2_t test_svmin_u16_x2(svuint16x2_t zdn, svuint16x2_t zm) { + return SVE_ACLE_FUNC(svmin,_u16_x2)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmin_u32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.umin.x2.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svmin_u32_x212svuint32x2_t12svuint32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.umin.x2.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint32x2_t 
test_svmin_u32_x2(svuint32x2_t zdn, svuint32x2_t zm) { + return SVE_ACLE_FUNC(svmin,_u32_x2)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmin_u64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM]], i64 2) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.umin.x2.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP6]], [[TMP7]], i64 2) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svmin_u64_x212svuint64x2_t12svuint64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM]], i64 2) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.umin.x2.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP6]], [[TMP7]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint64x2_t test_svmin_u64_x2(svuint64x2_t zdn, svuint64x2_t zm) { + return SVE_ACLE_FUNC(svmin,_u64_x2)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmin_f16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.fmin.x2.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP6]], [[TMP7]], i64 8) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svmin_f16_x213svfloat16x2_t13svfloat16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call 
@llvm.vector.extract.nxv8f16.nxv16f16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.fmin.x2.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP6]], [[TMP7]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat16x2_t test_svmin_f16_x2(svfloat16x2_t zdn, svfloat16x2_t zm) { + return SVE_ACLE_FUNC(svmin,_f16_x2)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmin_f32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.fmin.x2.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP6]], [[TMP7]], i64 4) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svmin_f32_x213svfloat32x2_t13svfloat32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.fmin.x2.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP6]], [[TMP7]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat32x2_t test_svmin_f32_x2(svfloat32x2_t zdn, svfloat32x2_t zm) { + return SVE_ACLE_FUNC(svmin,_f32_x2)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmin_f64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZDN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZM]], i64 2) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.fmin.x2.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } 
[[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP6]], [[TMP7]], i64 2) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svmin_f64_x213svfloat64x2_t13svfloat64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZDN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZM]], i64 2) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.fmin.x2.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP6]], [[TMP7]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat64x2_t test_svmin_f64_x2(svfloat64x2_t zdn, svfloat64x2_t zm) { + return SVE_ACLE_FUNC(svmin,_f64_x2)(zdn, zm); +} + +// Multi, x4 + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmin_s8_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 32) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 48) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.smin.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 16) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP12]], [[TMP13]], i64 32) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP14]], [[TMP15]], i64 48) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z16test_svmin_s8_x410svint8x4_t10svint8x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: 
[[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 32) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 48) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.smin.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 16) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP12]], [[TMP13]], i64 32) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP14]], [[TMP15]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svint8x4_t test_svmin_s8_x4(svint8x4_t zdn, svint8x4_t zm) { + return SVE_ACLE_FUNC(svmin,_s8_x4)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmin_s16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 24) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.smin.x4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 8) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP12]], [[TMP13]], i64 16) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP14]], [[TMP15]], i64 24) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z17test_svmin_s16_x411svint16x4_t11svint16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call 
@llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 24) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.smin.x4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 8) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP12]], [[TMP13]], i64 16) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP14]], [[TMP15]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svint16x4_t test_svmin_s16_x4(svint16x4_t zdn, svint16x4_t zm) { + return SVE_ACLE_FUNC(svmin,_s16_x4)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmin_s32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 12) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.smin.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 4) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP12]], [[TMP13]], i64 8) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP14]], [[TMP15]], i64 12) 
+// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z17test_svmin_s32_x411svint32x4_t11svint32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 12) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.smin.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 4) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP12]], [[TMP13]], i64 8) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP14]], [[TMP15]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svint32x4_t test_svmin_s32_x4(svint32x4_t zdn, svint32x4_t zm) { + return SVE_ACLE_FUNC(svmin,_s32_x4)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmin_s64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 2) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 6) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.smin.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP10]], [[TMP11]], i64 2) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP12]], [[TMP13]], i64 4) +// CHECK-NEXT: 
[[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP14]], [[TMP15]], i64 6) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z17test_svmin_s64_x411svint64x4_t11svint64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 2) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 6) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.smin.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP10]], [[TMP11]], i64 2) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP12]], [[TMP13]], i64 4) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP14]], [[TMP15]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svint64x4_t test_svmin_s64_x4(svint64x4_t zdn, svint64x4_t zm) { + return SVE_ACLE_FUNC(svmin,_s64_x4)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmin_u8_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 32) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 48) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.umin.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 16) +// CHECK-NEXT: [[TMP13:%.*]] = 
extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP12]], [[TMP13]], i64 32) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP14]], [[TMP15]], i64 48) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z16test_svmin_u8_x411svuint8x4_t11svuint8x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 32) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 48) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.umin.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 16) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP12]], [[TMP13]], i64 32) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP14]], [[TMP15]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svuint8x4_t test_svmin_u8_x4(svuint8x4_t zdn, svuint8x4_t zm) { + return SVE_ACLE_FUNC(svmin,_u8_x4)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmin_u16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 24) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.umin.x4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = 
extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 8) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP12]], [[TMP13]], i64 16) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP14]], [[TMP15]], i64 24) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z17test_svmin_u16_x412svuint16x4_t12svuint16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 24) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.umin.x4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 8) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP12]], [[TMP13]], i64 16) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP14]], [[TMP15]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svuint16x4_t test_svmin_u16_x4(svuint16x4_t zdn, svuint16x4_t zm) { + return SVE_ACLE_FUNC(svmin,_u16_x4)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmin_u32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 12) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.umin.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: 
[[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 4) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP12]], [[TMP13]], i64 8) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP14]], [[TMP15]], i64 12) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z17test_svmin_u32_x412svuint32x4_t12svuint32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 12) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.umin.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 4) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP12]], [[TMP13]], i64 8) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP14]], [[TMP15]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svuint32x4_t test_svmin_u32_x4(svuint32x4_t zdn, svuint32x4_t zm) { + return SVE_ACLE_FUNC(svmin,_u32_x4)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmin_u64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 2) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 6) +// 
CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.umin.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP10]], [[TMP11]], i64 2) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP12]], [[TMP13]], i64 4) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP14]], [[TMP15]], i64 6) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z17test_svmin_u64_x412svuint64x4_t12svuint64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 2) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 6) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.umin.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP10]], [[TMP11]], i64 2) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP12]], [[TMP13]], i64 4) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP14]], [[TMP15]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svuint64x4_t test_svmin_u64_x4(svuint64x4_t zdn, svuint64x4_t zm) { + return SVE_ACLE_FUNC(svmin,_u64_x4)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmin_f16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = 
tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 24) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmin.x4.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP10]], [[TMP11]], i64 8) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP12]], [[TMP13]], i64 16) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP14]], [[TMP15]], i64 24) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z17test_svmin_f16_x413svfloat16x4_t13svfloat16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 24) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmin.x4.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP10]], [[TMP11]], i64 8) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP12]], [[TMP13]], i64 16) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP14]], [[TMP15]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svfloat16x4_t test_svmin_f16_x4(svfloat16x4_t zdn, svfloat16x4_t zm) { + return SVE_ACLE_FUNC(svmin,_f16_x4)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmin_f32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail 
call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 12) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmin.x4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP10]], [[TMP11]], i64 4) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP12]], [[TMP13]], i64 8) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP14]], [[TMP15]], i64 12) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z17test_svmin_f32_x413svfloat32x4_t13svfloat32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 12) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmin.x4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP10]], [[TMP11]], i64 4) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP12]], [[TMP13]], i64 8) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP14]], [[TMP15]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svfloat32x4_t test_svmin_f32_x4(svfloat32x4_t zdn, svfloat32x4_t zm) { + return SVE_ACLE_FUNC(svmin,_f32_x4)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmin_f64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call 
@llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 2) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 6) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmin.x4.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP10]], [[TMP11]], i64 2) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP12]], [[TMP13]], i64 4) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP14]], [[TMP15]], i64 6) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z17test_svmin_f64_x413svfloat64x4_t13svfloat64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 2) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 6) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmin.x4.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP10]], [[TMP11]], i64 2) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP12]], [[TMP13]], i64 4) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP14]], [[TMP15]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svfloat64x4_t test_svmin_f64_x4(svfloat64x4_t zdn, svfloat64x4_t zm) { + return SVE_ACLE_FUNC(svmin,_f64_x4)(zdn, zm); +} diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_minnm.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_minnm.c new file mode 100644 --- /dev/null 
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_minnm.c @@ -0,0 +1,456 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED,A5) A1##A3##A5 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4,A5) A1##A2##A3##A4##A5 +#endif + +// Single, x2 + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svminnm_single_f16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.fminnm.single.x2.nxv8f16( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z26test_svminnm_single_f16_x213svfloat16x2_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.fminnm.single.x2.nxv8f16( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svfloat16x2_t test_svminnm_single_f16_x2(svfloat16x2_t zdn, svfloat16_t zm) { + return SVE_ACLE_FUNC(svminnm,_single_f16_x2,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svminnm_single_f32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] 
= tail call { , } @llvm.aarch64.sve.fminnm.single.x2.nxv4f32( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z26test_svminnm_single_f32_x213svfloat32x2_tu13__SVFloat32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.fminnm.single.x2.nxv4f32( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svfloat32x2_t test_svminnm_single_f32_x2(svfloat32x2_t zdn, svfloat32_t zm) { + return SVE_ACLE_FUNC(svminnm,_single_f32_x2,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svminnm_single_f64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZDN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.fminnm.single.x2.nxv2f64( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP4]], [[TMP5]], i64 2) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z26test_svminnm_single_f64_x213svfloat64x2_tu13__SVFloat64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZDN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.fminnm.single.x2.nxv2f64( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP4]], [[TMP5]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svfloat64x2_t test_svminnm_single_f64_x2(svfloat64x2_t zdn, svfloat64_t zm) { + return SVE_ACLE_FUNC(svminnm,_single_f64_x2,,,)(zdn, zm); +} + +// Single, x4 + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svminnm_single_f16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call 
@llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.fminnm.single.x4.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 8) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP8]], [[TMP9]], i64 16) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP10]], [[TMP11]], i64 24) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z26test_svminnm_single_f16_x413svfloat16x4_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.fminnm.single.x4.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 8) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP8]], [[TMP9]], i64 16) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP10]], [[TMP11]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svfloat16x4_t test_svminnm_single_f16_x4(svfloat16x4_t zdn, svfloat16_t zm) { + return SVE_ACLE_FUNC(svminnm,_single_f16_x4,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svminnm_single_f32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.fminnm.single.x4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( 
[[TMP6]], [[TMP7]], i64 4) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP8]], [[TMP9]], i64 8) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP10]], [[TMP11]], i64 12) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z26test_svminnm_single_f32_x413svfloat32x4_tu13__SVFloat32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.fminnm.single.x4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 4) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP8]], [[TMP9]], i64 8) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP10]], [[TMP11]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svfloat32x4_t test_svminnm_single_f32_x4(svfloat32x4_t zdn, svfloat32_t zm) { + return SVE_ACLE_FUNC(svminnm,_single_f32_x4,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svminnm_single_f64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.fminnm.single.x4.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 2) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP8]], [[TMP9]], i64 4) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP10]], [[TMP11]], i64 6) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z26test_svminnm_single_f64_x413svfloat64x4_tu13__SVFloat64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN:%.*]], i64 0) 
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.fminnm.single.x4.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 2) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP8]], [[TMP9]], i64 4) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP10]], [[TMP11]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svfloat64x4_t test_svminnm_single_f64_x4(svfloat64x4_t zdn, svfloat64_t zm) { + return SVE_ACLE_FUNC(svminnm,_single_f64_x4,,,)(zdn, zm); +} + +// Multi, x2 + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svminnm_multi_f16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.fminnm.x2.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP6]], [[TMP7]], i64 8) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z25test_svminnm_multi_f16_x213svfloat16x2_t13svfloat16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.fminnm.x2.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP6]], [[TMP7]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat16x2_t test_svminnm_multi_f16_x2(svfloat16x2_t zdn, svfloat16x2_t zm) { + return SVE_ACLE_FUNC(svminnm,_f16_x2,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// 
CHECK-LABEL: @test_svminnm_multi_f32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.fminnm.x2.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP6]], [[TMP7]], i64 4) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z25test_svminnm_multi_f32_x213svfloat32x2_t13svfloat32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.fminnm.x2.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP6]], [[TMP7]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat32x2_t test_svminnm_multi_f32_x2(svfloat32x2_t zdn, svfloat32x2_t zm) { + return SVE_ACLE_FUNC(svminnm,_f32_x2,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svminnm_multi_f64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZDN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZM]], i64 2) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.fminnm.x2.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP6]], [[TMP7]], i64 2) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z25test_svminnm_multi_f64_x213svfloat64x2_t13svfloat64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZDN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call 
@llvm.vector.extract.nxv2f64.nxv4f64( [[ZM]], i64 2) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.fminnm.x2.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP6]], [[TMP7]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat64x2_t test_svminnm_multi_f64_x2(svfloat64x2_t zdn, svfloat64x2_t zm) { + return SVE_ACLE_FUNC(svminnm,_f64_x2,,,)(zdn, zm); +} + +// Multi, x4 + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svminnm_multi_f16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 24) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.fminnm.x4.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP10]], [[TMP11]], i64 8) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP12]], [[TMP13]], i64 16) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP14]], [[TMP15]], i64 24) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z25test_svminnm_multi_f16_x413svfloat16x4_t13svfloat16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 24) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.fminnm.x4.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], 
[[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP10]], [[TMP11]], i64 8) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP12]], [[TMP13]], i64 16) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP14]], [[TMP15]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svfloat16x4_t test_svminnm_multi_f16_x4(svfloat16x4_t zdn, svfloat16x4_t zm) { + return SVE_ACLE_FUNC(svminnm,_f16_x4,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svminnm_multi_f32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 12) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.fminnm.x4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP10]], [[TMP11]], i64 4) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP12]], [[TMP13]], i64 8) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP14]], [[TMP15]], i64 12) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z25test_svminnm_multi_f32_x413svfloat32x4_t13svfloat32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 8) +// 
CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 12) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.fminnm.x4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP10]], [[TMP11]], i64 4) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP12]], [[TMP13]], i64 8) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP14]], [[TMP15]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svfloat32x4_t test_svminnm_multi_f32_x4(svfloat32x4_t zdn, svfloat32x4_t zm) { + return SVE_ACLE_FUNC(svminnm,_f32_x4,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svminnm_multi_f64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 2) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 6) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.fminnm.x4.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP10]], [[TMP11]], i64 2) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP12]], [[TMP13]], i64 4) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP14]], [[TMP15]], i64 6) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z25test_svminnm_multi_f64_x413svfloat64x4_t13svfloat64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM:%.*]], i64 0) +// 
CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 2) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 6) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.fminnm.x4.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP10]], [[TMP11]], i64 2) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP12]], [[TMP13]], i64 4) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP14]], [[TMP15]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svfloat64x4_t test_svminnm_multi_f64_x4(svfloat64x4_t zdn, svfloat64x4_t zm) { + return SVE_ACLE_FUNC(svminnm,_f64_x4,,,)(zdn, zm); +} diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mla.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mla.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mla.c @@ -0,0 +1,334 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. 
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED,A5) A1##A3##A5 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4,A5) A1##A2##A3##A4##A5 +#endif + +// +// Multi, multi +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla2_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.vg1x2.nxv4f32(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svmla2_f32j13svfloat32x2_t13svfloat32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.vg1x2.nxv4f32(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla2_f32(uint32_t slice_base, svfloat32x2_t zn, svfloat32x2_t zm) { + SVE_ACLE_FUNC(svmla,,_za32,_f32,_vg1x2)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla4_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 12) +// CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.vg1x4.nxv4f32(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svmla4_f32j13svfloat32x4_t13svfloat32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call 
@llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 12) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.vg1x4.nxv4f32(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla4_f32(uint32_t slice_base, svfloat32x4_t zn, svfloat32x4_t zm) { + SVE_ACLE_FUNC(svmla,,_za32,_f32,_vg1x4)(slice_base, 7, zn, zm); +} + +// +// Multi, single +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla_single2_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.single.vg1x2.nxv4f32(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmla_single2_f32j13svfloat32x2_tu13__SVFloat32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.single.vg1x2.nxv4f32(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_single2_f32(uint32_t slice_base, svfloat32x2_t zn, svfloat32_t zm) { + SVE_ACLE_FUNC(svmla,_single,_za32,_f32,_vg1x2)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla_single4_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.single.vg1x4.nxv4f32(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmla_single4_f32j13svfloat32x4_tu13__SVFloat32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.single.vg1x4.nxv4f32(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_single4_f32(uint32_t slice_base, svfloat32x4_t zn, svfloat32_t zm) { + SVE_ACLE_FUNC(svmla,_single,_za32,_f32,_vg1x4)(slice_base, 7, zn, zm); +} + +// 
+// Multi, indexed +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla_lane2_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.lane.vg1x2.nxv4f32(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmla_lane2_f32j13svfloat32x2_tu13__SVFloat32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.lane.vg1x2.nxv4f32(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_lane2_f32(uint32_t slice_base, svfloat32x2_t zn, svfloat32_t zm) { + SVE_ACLE_FUNC(svmla_lane,,_za32,_f32,_vg1x2)(slice_base, 7, zn, zm, 3); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla_lane4_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.lane.vg1x4.nxv4f32(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmla_lane4_f32j13svfloat32x4_tu13__SVFloat32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.lane.vg1x4.nxv4f32(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_lane4_f32(uint32_t slice_base, svfloat32x4_t zn, svfloat32_t zm) { + SVE_ACLE_FUNC(svmla_lane,,_za32,_f32,_vg1x4)(slice_base, 7, zn, zm, 3); +} + +// +// Multi, multi +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla2_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZM]], i64 2) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.vg1x2.nxv2f64(i32 [[TMP4]], 
[[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svmla2_f64j13svfloat64x2_t13svfloat64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZM]], i64 2) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.vg1x2.nxv2f64(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla2_f64(uint32_t slice_base, svfloat64x2_t zn, svfloat64x2_t zm) { + SVE_ACLE_FUNC(svmla,,_za64,_f64,_vg1x2)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla4_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 2) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 6) +// CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.vg1x4.nxv2f64(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svmla4_f64j13svfloat64x4_t13svfloat64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 2) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 6) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.vg1x4.nxv2f64(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla4_f64(uint32_t slice_base, svfloat64x4_t zn, svfloat64x4_t zm) { + SVE_ACLE_FUNC(svmla,,_za64,_f64,_vg1x4)(slice_base, 7, zn, zm); +} + +// +// Multi, single +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla_single2_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call 
@llvm.vector.extract.nxv2f64.nxv4f64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.single.vg1x2.nxv2f64(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmla_single2_f64j13svfloat64x2_tu13__SVFloat64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.single.vg1x2.nxv2f64(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_single2_f64(uint32_t slice_base, svfloat64x2_t zn, svfloat64_t zm) { + SVE_ACLE_FUNC(svmla,_single,_za64,_f64,_vg1x2)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla_single4_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.single.vg1x4.nxv2f64(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmla_single4_f64j13svfloat64x4_tu13__SVFloat64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.single.vg1x4.nxv2f64(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_single4_f64(uint32_t slice_base, svfloat64x4_t zn, svfloat64_t zm) { + SVE_ACLE_FUNC(svmla,_single,_za64,_f64,_vg1x4)(slice_base, 7, zn, zm); +} + +// +// Multi, indexed +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla_lane2_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.lane.vg1x2.nxv2f64(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 1) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmla_lane2_f64j13svfloat64x2_tu13__SVFloat64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN]], i64 2) +// 
CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.lane.vg1x2.nxv2f64(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 1) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_lane2_f64(uint32_t slice_base, svfloat64x2_t zn, svfloat64_t zm) { + SVE_ACLE_FUNC(svmla_lane,,_za64,_f64,_vg1x2)(slice_base, 7, zn, zm, 1); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla_lane4_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.lane.vg1x4.nxv2f64(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 1) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmla_lane4_f64j13svfloat64x4_tu13__SVFloat64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.lane.vg1x4.nxv2f64(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 1) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_lane4_f64(uint32_t slice_base, svfloat64x4_t zn, svfloat64_t zm) { + SVE_ACLE_FUNC(svmla_lane,,_za64,_f64,_vg1x4)(slice_base, 7, zn, zm, 1); +} diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlal.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlal.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlal.c @@ -0,0 +1,794 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. 
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED,A5) A1##A3##A5 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4,A5) A1##A2##A3##A4##A5 +#endif + +// +// Multi, multi +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla2_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.vg2x2.nxv8f16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svmla2_f16j13svfloat16x2_t13svfloat16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.vg2x2.nxv8f16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla2_f16(uint32_t slice_base, svfloat16x2_t zn, svfloat16x2_t zm) +{ + SVE_ACLE_FUNC(svmla_za32,_f16,_vg2x2,,)(slice_base, 6, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla2_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.vg2x2.nxv8bf16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z16test_svmla2_bf16j14svbfloat16x2_t14svbfloat16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.vg2x2.nxv8bf16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla2_bf16(uint32_t slice_base, svbfloat16x2_t zn, svbfloat16x2_t zm) +{ + SVE_ACLE_FUNC(svmla_za32,_bf16,_vg2x2,,)(slice_base, 6, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla2_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( 
[[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.vg2x2.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svmla2_u16j12svuint16x2_t12svuint16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.vg2x2.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16x2_t zm) +{ + SVE_ACLE_FUNC(svmla_za32,_u16,_vg2x2,,)(slice_base, 6, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla2_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.vg2x2.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svmla2_s16j11svint16x2_t11svint16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.vg2x2.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla2_s16(uint32_t slice_base, svint16x2_t zn, svint16x2_t zm) +{ + SVE_ACLE_FUNC(svmla_za32,_s16,_vg2x2,,)(slice_base, 6, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla4_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( 
[[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 24) +// CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.vg2x4.nxv8f16(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svmla4_f16j13svfloat16x4_t13svfloat16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 24) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.vg2x4.nxv8f16(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla4_f16(uint32_t slice_base, svfloat16x4_t zn, svfloat16x4_t zm) +{ + SVE_ACLE_FUNC(svmla_za32,_f16,_vg2x4,,)(slice_base, 6, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla4_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZM]], i64 24) +// CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.vg2x4.nxv8bf16(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z16test_svmla4_bf16j14svbfloat16x4_t14svbfloat16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 24) +// 
CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZM]], i64 24) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.vg2x4.nxv8bf16(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla4_bf16(uint32_t slice_base, svbfloat16x4_t zn, svbfloat16x4_t zm) +{ + SVE_ACLE_FUNC(svmla_za32,_bf16,_vg2x4,,)(slice_base, 6, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla4_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 24) +// CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.vg2x4.nxv8i16(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svmla4_u16j12svuint16x4_t12svuint16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 24) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.vg2x4.nxv8i16(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16x4_t zm) +{ + SVE_ACLE_FUNC(svmla_za32,_u16,_vg2x4,,)(slice_base, 6, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla4_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call 
@llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 24) +// CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.vg2x4.nxv8i16(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svmla4_s16j11svint16x4_t11svint16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 24) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.vg2x4.nxv8i16(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla4_s16(uint32_t slice_base, svint16x4_t zn, svint16x4_t zm) +{ + SVE_ACLE_FUNC(svmla_za32,_s16,_vg2x4,,)(slice_base, 6, zn, zm); +} + +// +// Multi, single +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla_single1_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 14 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.single.vg2x1.nxv8f16(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmla_single1_f16ju13__SVFloat16_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 14 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.single.vg2x1.nxv8f16(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_single1_f16(uint32_t slice_base, svfloat16_t zn, svfloat16_t zm) +{ + SVE_ACLE_FUNC(svmla,_single,_za32,_f16,_vg2x1)(slice_base, 14, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla_single1_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 14 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.single.vg2x1.nxv8bf16(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_svmla_single1_bf16ju14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 14 +// CPP-CHECK-NEXT: 
tail call void @llvm.aarch64.sme.fmlal.single.vg2x1.nxv8bf16(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_single1_bf16(uint32_t slice_base, svbfloat16_t zn, svbfloat16_t zm) +{ + SVE_ACLE_FUNC(svmla,_single,_za32,_bf16,_vg2x1)(slice_base, 14, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla_single1_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 14 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.single.vg2x1.nxv8i16(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmla_single1_u16ju12__SVUint16_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 14 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.single.vg2x1.nxv8i16(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_single1_u16(uint32_t slice_base, svuint16_t zn, svuint16_t zm) +{ + SVE_ACLE_FUNC(svmla,_single,_za32,_u16,_vg2x1)(slice_base, 14, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla_single1_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 14 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.single.vg2x1.nxv8i16(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmla_single1_s16ju11__SVInt16_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 14 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.single.vg2x1.nxv8i16(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_single1_s16(uint32_t slice_base, svint16_t zn, svint16_t zm) +{ + SVE_ACLE_FUNC(svmla,_single,_za32,_s16,_vg2x1)(slice_base, 14, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla_single2_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.single.vg2x2.nxv8f16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmla_single2_f16j13svfloat16x2_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.single.vg2x2.nxv8f16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_single2_f16(uint32_t slice_base, svfloat16x2_t zn, svfloat16_t zm) +{ + SVE_ACLE_FUNC(svmla,_single,_za32,_f16,_vg2x2)(slice_base, 6, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla_single2_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void 
@llvm.aarch64.sme.fmlal.single.vg2x2.nxv8bf16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_svmla_single2_bf16j14svbfloat16x2_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.single.vg2x2.nxv8bf16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_single2_bf16(uint32_t slice_base, svbfloat16x2_t zn, svbfloat16_t zm) +{ + SVE_ACLE_FUNC(svmla,_single,_za32,_bf16,_vg2x2)(slice_base, 6, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla_single2_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.single.vg2x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmla_single2_u16j12svuint16x2_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.single.vg2x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_single2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) +{ + SVE_ACLE_FUNC(svmla,_single,_za32,_u16,_vg2x2)(slice_base, 6, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla_single2_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.single.vg2x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmla_single2_s16j11svint16x2_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.single.vg2x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_single2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) +{ + SVE_ACLE_FUNC(svmla,_single,_za32,_s16,_vg2x2)(slice_base, 6, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla_single4_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( 
[[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.single.vg2x4.nxv8f16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmla_single4_f16j13svfloat16x4_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.single.vg2x4.nxv8f16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_single4_f16(uint32_t slice_base, svfloat16x4_t zn, svfloat16_t zm) +{ + SVE_ACLE_FUNC(svmla,_single,_za32,_f16,_vg2x4)(slice_base, 6, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla_single4_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.single.vg2x4.nxv8bf16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_svmla_single4_bf16j14svbfloat16x4_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.single.vg2x4.nxv8bf16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_single4_bf16(uint32_t slice_base, svbfloat16x4_t zn, svbfloat16_t zm) +{ + SVE_ACLE_FUNC(svmla,_single,_za32,_bf16,_vg2x4)(slice_base, 6, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla_single4_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CHECK-NEXT: 
[[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.single.vg2x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmla_single4_u16j12svuint16x4_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.single.vg2x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_single4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) +{ + SVE_ACLE_FUNC(svmla,_single,_za32,_u16,_vg2x4)(slice_base, 6, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla_single4_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.single.vg2x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmla_single4_s16j11svint16x4_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.single.vg2x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_single4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) +{ + SVE_ACLE_FUNC(svmla,_single,_za32,_s16,_vg2x4)(slice_base, 6, zn, zm); +} + +// +// Multi, indexed +// + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla_lane1_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 14 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.lane.vg2x1.nxv8f16(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmla_lane1_f16ju13__SVFloat16_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 14 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.lane.vg2x1.nxv8f16(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_lane1_f16(uint32_t slice_base, svfloat16_t zn, svfloat16_t zm) +{ + 
SVE_ACLE_FUNC(svmla_lane,,_za32,_f16,_vg2x1)(slice_base, 14, zn, zm, 7); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla_lane1_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 14 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.lane.vg2x1.nxv8bf16(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_svmla_lane1_bf16ju14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 14 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.lane.vg2x1.nxv8bf16(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_lane1_bf16(uint32_t slice_base, svbfloat16_t zn, svbfloat16_t zm) +{ + SVE_ACLE_FUNC(svmla_lane,,_za32,_bf16,_vg2x1)(slice_base, 14, zn, zm, 7); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla_lane1_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 14 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.lane.vg2x1.nxv8i16(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmla_lane1_u16ju12__SVUint16_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 14 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.lane.vg2x1.nxv8i16(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_lane1_u16(uint32_t slice_base, svuint16_t zn, svuint16_t zm) +{ + SVE_ACLE_FUNC(svmla_lane,,_za32,_u16,_vg2x1)(slice_base, 14, zn, zm, 7); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla_lane1_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 14 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.lane.vg2x1.nxv8i16(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmla_lane1_s16ju11__SVInt16_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 14 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.lane.vg2x1.nxv8i16(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_lane1_s16(uint32_t slice_base, svint16_t zn, svint16_t zm) +{ + SVE_ACLE_FUNC(svmla_lane,,_za32,_s16,_vg2x1)(slice_base, 14, zn, zm, 7); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla_lane2_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.lane.vg2x2.nxv8f16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmla_lane2_f16j13svfloat16x2_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.lane.vg2x2.nxv8f16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 7) +// 
CPP-CHECK-NEXT: ret void +// +void test_svmla_lane2_f16(uint32_t slice_base, svfloat16x2_t zn, svfloat16_t zm) +{ + SVE_ACLE_FUNC(svmla_lane,,_za32,_f16,_vg2x2)(slice_base, 6, zn, zm, 7); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla_lane2_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.lane.vg2x2.nxv8bf16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_svmla_lane2_bf16j14svbfloat16x2_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.lane.vg2x2.nxv8bf16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_lane2_bf16(uint32_t slice_base, svbfloat16x2_t zn, svbfloat16_t zm) +{ + SVE_ACLE_FUNC(svmla_lane,,_za32,_bf16,_vg2x2)(slice_base, 6, zn, zm, 7); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla_lane2_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.lane.vg2x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmla_lane2_u16j12svuint16x2_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.lane.vg2x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_lane2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) +{ + SVE_ACLE_FUNC(svmla_lane,,_za32,_u16,_vg2x2)(slice_base, 6, zn, zm, 7); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla_lane2_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.lane.vg2x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmla_lane2_s16j11svint16x2_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// 
CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.lane.vg2x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_lane2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) +{ + SVE_ACLE_FUNC(svmla_lane,,_za32,_s16,_vg2x2)(slice_base, 6, zn, zm, 7); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla_lane4_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.lane.vg2x4.nxv8f16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmla_lane4_f16j13svfloat16x4_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.lane.vg2x4.nxv8f16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_lane4_f16(uint32_t slice_base, svfloat16x4_t zn, svfloat16_t zm) +{ + SVE_ACLE_FUNC(svmla_lane,,_za32,_f16,_vg2x4)(slice_base, 6, zn, zm, 7); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla_lane4_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.lane.vg2x4.nxv8bf16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_svmla_lane4_bf16j14svbfloat16x4_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.lane.vg2x4.nxv8bf16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_lane4_bf16(uint32_t slice_base, svbfloat16x4_t 
zn, svbfloat16_t zm) +{ + SVE_ACLE_FUNC(svmla_lane,,_za32,_bf16,_vg2x4)(slice_base, 6, zn, zm, 7); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla_lane4_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.lane.vg2x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmla_lane4_u16j12svuint16x4_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.lane.vg2x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_lane4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) +{ + SVE_ACLE_FUNC(svmla_lane,,_za32,_u16,_vg2x4)(slice_base, 6, zn, zm, 7); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla_lane4_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.lane.vg2x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmla_lane4_s16j11svint16x4_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.lane.vg2x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_lane4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) +{ + SVE_ACLE_FUNC(svmla_lane,,_za32,_s16,_vg2x4)(slice_base, 6, zn, zm, 7); +} diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlall.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlall.c new file mode 
100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlall.c @@ -0,0 +1,1945 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -target-feature +sme-i16i64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED,A5) A1##A3##A5 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4,A5) A1##A2##A3##A4##A5 +#endif + +// +// Single x 1 +// + +// MLAL + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla_single_x1_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 12 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.single.vg4x1.nxv16i8(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_svmla_single_x1_s8ju10__SVInt8_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 12 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.single.vg4x1.nxv16i8(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_single_x1_s8(uint32_t slice_base, svint8_t zn, svint8_t zm) +{ + SVE_ACLE_FUNC(svmla,_single,_za32,_s8,_vg4x1)(slice_base, 12, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla_single_x1_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 12 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.single.vg4x1.nxv8i16(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svmla_single_x1_s16ju11__SVInt16_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 12 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.single.vg4x1.nxv8i16(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_single_x1_s16(uint32_t slice_base, svint16_t zn, svint16_t zm) +{ + SVE_ACLE_FUNC(svmla,_single,_za64,_s16,_vg4x1)(slice_base, 12, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_uvmlal_single_x1_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 12 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.single.vg4x1.nxv16i8(i32 [[TMP0]], [[ZN:%.*]], 
[[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_uvmlal_single_x1_u8ju11__SVUint8_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 12 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.single.vg4x1.nxv16i8(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_uvmlal_single_x1_u8(uint32_t slice_base, svuint8_t zn, svuint8_t zm) +{ + SVE_ACLE_FUNC(svmla,_single,_za32,_u8,_vg4x1)(slice_base, 12, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_uvmlal_single_x1_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 12 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.single.vg4x1.nxv8i16(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_uvmlal_single_x1_u16ju12__SVUint16_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 12 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.single.vg4x1.nxv8i16(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_uvmlal_single_x1_u16(uint32_t slice_base, svuint16_t zn, svuint16_t zm) +{ + SVE_ACLE_FUNC(svmla,_single,_za64,_u16,_vg4x1)(slice_base, 12, zn, zm); +} + +// MLSL + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls_single_x1_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 12 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.single.vg4x1.nxv16i8(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_svmls_single_x1_s8ju10__SVInt8_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 12 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.single.vg4x1.nxv16i8(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_single_x1_s8(uint32_t slice_base, svint8_t zn, svint8_t zm) +{ + SVE_ACLE_FUNC(svmls,_single,_za32,_s8,_vg4x1)(slice_base, 12, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls_single_x1_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 12 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.single.vg4x1.nxv8i16(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svmls_single_x1_s16ju11__SVInt16_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 12 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.single.vg4x1.nxv8i16(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_single_x1_s16(uint32_t slice_base, svint16_t zn, svint16_t zm) +{ + SVE_ACLE_FUNC(svmls,_single,_za64,_s16,_vg4x1)(slice_base, 12, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_uvmlsl_single_x1_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 12 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.single.vg4x1.nxv16i8(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_uvmlsl_single_x1_u8ju11__SVUint8_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 12 +// CPP-CHECK-NEXT: tail call void 
@llvm.aarch64.sme.umls.za32.single.vg4x1.nxv16i8(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_uvmlsl_single_x1_u8(uint32_t slice_base, svuint8_t zn, svuint8_t zm) +{ + SVE_ACLE_FUNC(svmls,_single,_za32,_u8,_vg4x1)(slice_base, 12, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_uvmlsl_single_x1_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 12 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.single.vg4x1.nxv8i16(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_uvmlsl_single_x1_u16ju12__SVUint16_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 12 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.single.vg4x1.nxv8i16(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_uvmlsl_single_x1_u16(uint32_t slice_base, svuint16_t zn, svuint16_t zm) +{ + SVE_ACLE_FUNC(svmls,_single,_za64,_u16,_vg4x1)(slice_base, 12, zn, zm); +} + +// USMLALL + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_usmlall_single_x1_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 12 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.single.vg4x1.nxv16i8(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_usmlall_single_x1_s8ju11__SVUint8_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 12 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.single.vg4x1.nxv16i8(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_usmlall_single_x1_s8(uint32_t slice_base, svuint8_t zn, svint8_t zm) +{ + SVE_ACLE_FUNC(svusmla,_single,_za32,_s8,_vg4x1)(slice_base, 12, zn, zm); +} + +// +// Single x 2 +// + +// MLAL + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla_single_x2_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.single.vg4x2.nxv16i8(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_svmla_single_x2_s8j10svint8x2_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.single.vg4x2.nxv16i8(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_single_x2_s8(uint32_t slice_base, svint8x2_t zn, svint8_t zm) +{ + SVE_ACLE_FUNC(svmla,_single,_za32,_s8,_vg4x2)(slice_base, 4, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla_single_x2_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.single.vg4x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svmla_single_x2_s16j11svint16x2_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.single.vg4x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_single_x2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) +{ + SVE_ACLE_FUNC(svmla,_single,_za64,_s16,_vg4x2)(slice_base, 4, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla_single_x2_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.single.vg4x2.nxv16i8(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_svmla_single_x2_u8j11svuint8x2_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.single.vg4x2.nxv16i8(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_single_x2_u8(uint32_t slice_base, svuint8x2_t zn, svuint8_t zm) +{ + SVE_ACLE_FUNC(svmla,_single,_za32,_u8,_vg4x2)(slice_base, 4, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla_single_x2_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.single.vg4x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svmla_single_x2_u16j12svuint16x2_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.single.vg4x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_single_x2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) +{ + SVE_ACLE_FUNC(svmla,_single,_za64,_u16,_vg4x2)(slice_base, 4, zn, zm); +} + +// MLSL + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls_single_x2_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = 
tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.single.vg4x2.nxv16i8(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_svmls_single_x2_s8j10svint8x2_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.single.vg4x2.nxv16i8(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_single_x2_s8(uint32_t slice_base, svint8x2_t zn, svint8_t zm) +{ + SVE_ACLE_FUNC(svmls,_single,_za32,_s8,_vg4x2)(slice_base, 4, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls_single_x2_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.single.vg4x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svmls_single_x2_s16j11svint16x2_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.single.vg4x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_single_x2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) +{ + SVE_ACLE_FUNC(svmls,_single,_za64,_s16,_vg4x2)(slice_base, 4, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls_single_x2_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.single.vg4x2.nxv16i8(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_svmls_single_x2_u8j11svuint8x2_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.single.vg4x2.nxv16i8(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_single_x2_u8(uint32_t slice_base, svuint8x2_t zn, svuint8_t zm) +{ + SVE_ACLE_FUNC(svmls,_single,_za32,_u8,_vg4x2)(slice_base, 4, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls_single_x2_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: 
[[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.single.vg4x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svmls_single_x2_u16j12svuint16x2_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.single.vg4x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_single_x2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) +{ + SVE_ACLE_FUNC(svmls,_single,_za64,_u16,_vg4x2)(slice_base, 4, zn, zm); +} + +// SUMLALL + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svsumla_single_x2_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sumla.za32.single.vg4x2.nxv16i8(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svsumla_single_x2_u8j10svint8x2_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sumla.za32.single.vg4x2.nxv16i8(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svsumla_single_x2_u8(uint32_t slice_base, svint8x2_t zn, svuint8_t zm) +{ + SVE_ACLE_FUNC(svsumla,_single,_za32,_u8,_vg4x2)(slice_base, 4, zn, zm); +} + +// USMLALL + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_usmlall_single_x2_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.single.vg4x2.nxv16i8(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_usmlall_single_x2_s8j11svuint8x2_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.single.vg4x2.nxv16i8(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_usmlall_single_x2_s8(uint32_t slice_base, svuint8x2_t zn, svint8_t zm) +{ + 
SVE_ACLE_FUNC(svusmla,_single,_za32,_s8,_vg4x2)(slice_base, 4, zn, zm); +} + +// +// Single x 4 +// + +// MLAL + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla_single_x4_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.single.vg4x4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_svmla_single_x4_s8j10svint8x4_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.single.vg4x4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_single_x4_s8(uint32_t slice_base, svint8x4_t zn, svint8_t zm) +{ + SVE_ACLE_FUNC(svmla,_single,_za32,_s8,_vg4x4)(slice_base, 4, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla_single_x4_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.single.vg4x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svmla_single_x4_s16j11svint16x4_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.single.vg4x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_single_x4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) +{ + SVE_ACLE_FUNC(svmla,_single,_za64,_s16,_vg4x4)(slice_base, 4, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla_single_x4_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = 
tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.single.vg4x4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_svmla_single_x4_u8j11svuint8x4_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.single.vg4x4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_single_x4_u8(uint32_t slice_base, svuint8x4_t zn, svuint8_t zm) +{ + SVE_ACLE_FUNC(svmla,_single,_za32,_u8,_vg4x4)(slice_base, 4, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla_single_x4_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.single.vg4x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svmla_single_x4_u16j12svuint16x4_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.single.vg4x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_single_x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) +{ + SVE_ACLE_FUNC(svmla,_single,_za64,_u16,_vg4x4)(slice_base, 4, zn, zm); +} + +// MLSL + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls_single_x4_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call 
@llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.single.vg4x4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_svmls_single_x4_s8j10svint8x4_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.single.vg4x4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_single_x4_s8(uint32_t slice_base, svint8x4_t zn, svint8_t zm) +{ + SVE_ACLE_FUNC(svmls,_single,_za32,_s8,_vg4x4)(slice_base, 4, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls_single_x4_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.single.vg4x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svmls_single_x4_s16j11svint16x4_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.single.vg4x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_single_x4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) +{ + SVE_ACLE_FUNC(svmls,_single,_za64,_s16,_vg4x4)(slice_base, 4, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls_single_x4_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void 
@llvm.aarch64.sme.umls.za32.single.vg4x4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_svmls_single_x4_u8j11svuint8x4_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.single.vg4x4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_single_x4_u8(uint32_t slice_base, svuint8x4_t zn, svuint8_t zm) +{ + SVE_ACLE_FUNC(svmls,_single,_za32,_u8,_vg4x4)(slice_base, 4, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls_single_x4_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.single.vg4x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svmls_single_x4_u16j12svuint16x4_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.single.vg4x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_single_x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) +{ + SVE_ACLE_FUNC(svmls,_single,_za64,_u16,_vg4x4)(slice_base, 4, zn, zm); +} + +// SUMLALL + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svsumla_single_x4_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sumla.za32.single.vg4x4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svsumla_single_x4_u8j10svint8x4_tu11__SVUint8_t( +// 
CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sumla.za32.single.vg4x4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svsumla_single_x4_u8(uint32_t slice_base, svint8x4_t zn, svuint8_t zm) +{ + SVE_ACLE_FUNC(svsumla,_single,_za32,_u8,_vg4x4)(slice_base, 4, zn, zm); +} + +// USMLALL + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_usmlall_single_x4_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.single.vg4x4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_usmlall_single_x4_s8j11svuint8x4_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.single.vg4x4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_usmlall_single_x4_s8(uint32_t slice_base, svuint8x4_t zn, svint8_t zm) +{ + SVE_ACLE_FUNC(svusmla,_single,_za32,_s8,_vg4x4)(slice_base, 4, zn, zm); +} + +// +// Multi x 2 +// + +// MLAL + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_mlal_multi_x2_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.vg4x2.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_mlal_multi_x2_s8j10svint8x2_t10svint8x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) 
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.vg4x2.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_mlal_multi_x2_s8(uint32_t slice_base, svint8x2_t zn, svint8x2_t zm) +{ + SVE_ACLE_FUNC(svmla_za32,_s8,_vg4x2,,)(slice_base, 4, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_mlal_multi_x2_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.vg4x2.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_mlal_multi_x2_s16j11svint16x2_t11svint16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.vg4x2.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_mlal_multi_x2_s16(uint32_t slice_base, svint16x2_t zn, svint16x2_t zm) +{ + SVE_ACLE_FUNC(svmla_za64,_s16,_vg4x2,,)(slice_base, 4, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_mlal_multi_x2_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.vg4x2.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_mlal_multi_x2_u8j11svuint8x2_t11svuint8x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void 
@llvm.aarch64.sme.umla.za32.vg4x2.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_mlal_multi_x2_u8(uint32_t slice_base, svuint8x2_t zn, svuint8x2_t zm) +{ + SVE_ACLE_FUNC(svmla_za32,_u8,_vg4x2,,)(slice_base, 4, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_mlal_multi_x2_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.vg4x2.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_mlal_multi_x2_u16j12svuint16x2_t12svuint16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.vg4x2.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_mlal_multi_x2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16x2_t zm) +{ + SVE_ACLE_FUNC(svmla_za64,_u16,_vg4x2,,)(slice_base, 4, zn, zm); +} + +// MLSL + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_mlsl_multi_x2_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.vg4x2.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_mlsl_multi_x2_s8j10svint8x2_t10svint8x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.vg4x2.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_mlsl_multi_x2_s8(uint32_t slice_base, svint8x2_t zn, svint8x2_t zm) +{ + SVE_ACLE_FUNC(svmls_za32,_s8,_vg4x2,,)(slice_base, 4, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) 
+// CHECK-LABEL: @test_mlsl_multi_x2_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.vg4x2.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_mlsl_multi_x2_s16j11svint16x2_t11svint16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.vg4x2.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_mlsl_multi_x2_s16(uint32_t slice_base, svint16x2_t zn, svint16x2_t zm) +{ + SVE_ACLE_FUNC(svmls_za64,_s16,_vg4x2,,)(slice_base, 4, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_mlsl_multi_x2_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.vg4x2.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_mlsl_multi_x2_u8j11svuint8x2_t11svuint8x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.vg4x2.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_mlsl_multi_x2_u8(uint32_t slice_base, svuint8x2_t zn, svuint8x2_t zm) +{ + SVE_ACLE_FUNC(svmls_za32,_u8,_vg4x2,,)(slice_base, 4, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_mlsl_multi_x2_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( 
[[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.vg4x2.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_mlsl_multi_x2_u16j12svuint16x2_t12svuint16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.vg4x2.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_mlsl_multi_x2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16x2_t zm) +{ + SVE_ACLE_FUNC(svmls_za64,_u16,_vg4x2,,)(slice_base, 4, zn, zm); +} + +// USMLALL + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_usmlal_multi_x2_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.vg4x2.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_usmlal_multi_x2_s8j11svuint8x2_t10svint8x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.vg4x2.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_usmlal_multi_x2_s8(uint32_t slice_base, svuint8x2_t zn, svint8x2_t zm) +{ + SVE_ACLE_FUNC(svusmla_za32,_s8,_vg4x2,,)(slice_base, 4, zn, zm); +} + +// +// Multi x 4 +// + +// MLAL + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_mlal_multi_x4_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call 
@llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 32) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 48) +// CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.vg4x4.nxv16i8(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_mlal_multi_x4_s8j10svint8x4_t10svint8x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 32) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 48) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.vg4x4.nxv16i8(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_mlal_multi_x4_s8(uint32_t slice_base, svint8x4_t zn, svint8x4_t zm) +{ + SVE_ACLE_FUNC(svmla_za32,_s8,_vg4x4,,)(slice_base, 4, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_mlal_multi_x4_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 24) +// CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.vg4x4.nxv8i16(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_mlal_multi_x4_s16j11svint16x4_t11svint16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( 
[[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 24) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.vg4x4.nxv8i16(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_mlal_multi_x4_s16(uint32_t slice_base, svint16x4_t zn, svint16x4_t zm) +{ + SVE_ACLE_FUNC(svmla_za64,_s16,_vg4x4,,)(slice_base, 4, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_mlal_multi_x4_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 32) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 48) +// CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.vg4x4.nxv16i8(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_mlal_multi_x4_u8j11svuint8x4_t11svuint8x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 32) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 48) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.vg4x4.nxv16i8(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_mlal_multi_x4_u8(uint32_t slice_base, svuint8x4_t zn, svuint8x4_t zm) +{ + SVE_ACLE_FUNC(svmla_za32,_u8,_vg4x4,,)(slice_base, 4, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_mlal_multi_x4_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call 
@llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 24) +// CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.vg4x4.nxv8i16(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_mlal_multi_x4_u16j12svuint16x4_t12svuint16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 24) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.vg4x4.nxv8i16(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_mlal_multi_x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16x4_t zm) +{ + SVE_ACLE_FUNC(svmla_za64,_u16,_vg4x4,,)(slice_base, 4, zn, zm); +} + +// MLSL + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_mlsl_multi_x4_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 32) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 48) +// CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.vg4x4.nxv16i8(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_mlsl_multi_x4_s8j10svint8x4_t10svint8x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call 
@llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 32) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 48) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.vg4x4.nxv16i8(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_mlsl_multi_x4_s8(uint32_t slice_base, svint8x4_t zn, svint8x4_t zm) +{ + SVE_ACLE_FUNC(svmls_za32,_s8,_vg4x4,,)(slice_base, 4, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_mlsl_multi_x4_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 24) +// CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.vg4x4.nxv8i16(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_mlsl_multi_x4_s16j11svint16x4_t11svint16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 24) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.vg4x4.nxv8i16(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_mlsl_multi_x4_s16(uint32_t slice_base, svint16x4_t zn, svint16x4_t zm) +{ + SVE_ACLE_FUNC(svmls_za64,_s16,_vg4x4,,)(slice_base, 4, zn, zm); +} + 
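For readers following the SVE_ACLE_FUNC calls in these tests: the file is compiled twice, once as C (the CHECK lines) and once as C++ (the CPP-CHECK lines with mangled labels), and the intrinsic actually called is assembled by pasting the macro's fragments together. The macro itself lives in the test preamble, which is not part of this hunk; the following is only a minimal sketch of the usual pattern in these autogenerated SVE/SME tests, written as an assumption about that preamble rather than a quote of it.

#ifndef SVE_OVERLOADED_FORMS
// Explicitly-typed build: paste every fragment, e.g.
//   SVE_ACLE_FUNC(svmls_za64,_s16,_vg4x4,,) -> svmls_za64_s16_vg4x4
#define SVE_ACLE_FUNC(A1, A2, A3, A4, A5) A1##A2##A3##A4##A5
#else
// Overloaded build: drop the element-type suffixes, e.g.
//   SVE_ACLE_FUNC(svmls_za64,_s16,_vg4x4,,) -> svmls_za64_vg4x4
#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED, A5) A1##A3##A5
#endif

Under that assumption, the u8 variant that follows resolves to svmls_za32_u8_vg4x4(slice_base, 4, zn, zm) in the explicitly-typed build, which is the call the CHECK lines below lower to @llvm.aarch64.sme.umls.za32.vg4x4.nxv16i8.
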
+__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_mlsl_multi_x4_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 32) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 48) +// CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.vg4x4.nxv16i8(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_mlsl_multi_x4_u8j11svuint8x4_t11svuint8x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 32) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 48) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.vg4x4.nxv16i8(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_mlsl_multi_x4_u8(uint32_t slice_base, svuint8x4_t zn, svuint8x4_t zm) +{ + SVE_ACLE_FUNC(svmls_za32,_u8,_vg4x4,,)(slice_base, 4, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_mlsl_multi_x4_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 24) +// CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.vg4x4.nxv8i16(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], 
[[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_mlsl_multi_x4_u16j12svuint16x4_t12svuint16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 24) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.vg4x4.nxv8i16(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_mlsl_multi_x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16x4_t zm) +{ + SVE_ACLE_FUNC(svmls_za64,_u16,_vg4x4,,)(slice_base, 4, zn, zm); +} + +// USMLALL + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_usmlal_multi_x4_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 32) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 48) +// CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.vg4x4.nxv16i8(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_usmlal_multi_x4_s8j11svuint8x4_t10svint8x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 32) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 48) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void 
@llvm.aarch64.sme.usmla.za32.vg4x4.nxv16i8(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_usmlal_multi_x4_s8(uint32_t slice_base, svuint8x4_t zn, svint8x4_t zm) +{ + SVE_ACLE_FUNC(svusmla_za32,_s8,_vg4x4,,)(slice_base, 4, zn, zm); +} + +// +// Indexed x 1 +// + +// SMLAL + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_smlal_lane_x1_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 12 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.lane.vg4x1.nxv16i8(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]], i32 15) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_smlal_lane_x1_s8ju10__SVInt8_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 12 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.lane.vg4x1.nxv16i8(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]], i32 15) +// CPP-CHECK-NEXT: ret void +// +void test_smlal_lane_x1_s8(uint32_t slice_base, svint8_t zn, svint8_t zm) +{ + SVE_ACLE_FUNC(svmla_lane_za32,_s8,_vg4x1,,)(slice_base, 12, zn, zm, 15); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_smlal_lane_x1_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 12 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.lane.vg4x1.nxv8i16(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_smlal_lane_x1_s16ju11__SVInt16_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 12 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.lane.vg4x1.nxv8i16(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_smlal_lane_x1_s16(uint32_t slice_base, svint16_t zn, svint16_t zm) +{ + SVE_ACLE_FUNC(svmla_lane_za64,_s16,_vg4x1,,)(slice_base, 12, zn, zm, 7); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_smlal_lane_x1_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 12 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.lane.vg4x1.nxv16i8(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]], i32 15) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_smlal_lane_x1_u8ju11__SVUint8_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 12 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.lane.vg4x1.nxv16i8(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]], i32 15) +// CPP-CHECK-NEXT: ret void +// +void test_smlal_lane_x1_u8(uint32_t slice_base, svuint8_t zn, svuint8_t zm) +{ + SVE_ACLE_FUNC(svmla_lane_za32,_u8,_vg4x1,,)(slice_base, 12, zn, zm, 15); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_smlal_lane_x1_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 12 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.lane.vg4x1.nxv8i16(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_smlal_lane_x1_u16ju12__SVUint16_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 12 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.lane.vg4x1.nxv8i16(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_smlal_lane_x1_u16(uint32_t slice_base, svuint16_t zn, svuint16_t 
zm) +{ + SVE_ACLE_FUNC(svmla_lane_za64,_u16,_vg4x1,,)(slice_base, 12, zn, zm, 7); +} + +// SMLSL + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_smlsl_lane_x1_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 12 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.lane.vg4x1.nxv16i8(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]], i32 15) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_smlsl_lane_x1_s8ju10__SVInt8_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 12 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.lane.vg4x1.nxv16i8(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]], i32 15) +// CPP-CHECK-NEXT: ret void +// +void test_smlsl_lane_x1_s8(uint32_t slice_base, svint8_t zn, svint8_t zm) +{ + SVE_ACLE_FUNC(svmls_lane_za32,_s8,_vg4x1,,)(slice_base, 12, zn, zm, 15); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_smlsl_lane_x1_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 12 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.lane.vg4x1.nxv8i16(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_smlsl_lane_x1_s16ju11__SVInt16_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 12 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.lane.vg4x1.nxv8i16(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_smlsl_lane_x1_s16(uint32_t slice_base, svint16_t zn, svint16_t zm) +{ + SVE_ACLE_FUNC(svmls_lane_za64,_s16,_vg4x1,,)(slice_base, 12, zn, zm, 7); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_smlsl_lane_x1_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 12 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.lane.vg4x1.nxv16i8(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]], i32 15) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_smlsl_lane_x1_u8ju11__SVUint8_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 12 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.lane.vg4x1.nxv16i8(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]], i32 15) +// CPP-CHECK-NEXT: ret void +// +void test_smlsl_lane_x1_u8(uint32_t slice_base, svuint8_t zn, svuint8_t zm) +{ + SVE_ACLE_FUNC(svmls_lane_za32,_u8,_vg4x1,,)(slice_base, 12, zn, zm, 15); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_smlsl_lane_x1_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 12 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.lane.vg4x1.nxv8i16(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_smlsl_lane_x1_u16ju12__SVUint16_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 12 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.lane.vg4x1.nxv8i16(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_smlsl_lane_x1_u16(uint32_t slice_base, svuint16_t zn, svuint16_t zm) +{ + SVE_ACLE_FUNC(svmls_lane_za64,_u16,_vg4x1,,)(slice_base, 12, zn, zm, 7); +} + +// SUMLALL + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_sumlall_lane_x1_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 
[[SLICE_BASE:%.*]], 12 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sumla.za32.lane.vg4x1.nxv16i8(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]], i32 15) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_sumlall_lane_x1_s8ju10__SVInt8_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 12 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sumla.za32.lane.vg4x1.nxv16i8(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]], i32 15) +// CPP-CHECK-NEXT: ret void +// +void test_sumlall_lane_x1_s8(uint32_t slice_base, svint8_t zn, svuint8_t zm) +{ + SVE_ACLE_FUNC(svsumla_lane_za32,_s8,_vg4x1,,)(slice_base, 12, zn, zm, 15); +} + +// USMLALL + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_usmlall_lane_x1_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 12 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.lane.vg4x1.nxv16i8(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]], i32 15) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_usmlall_lane_x1_s8ju11__SVUint8_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 12 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.lane.vg4x1.nxv16i8(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]], i32 15) +// CPP-CHECK-NEXT: ret void +// +void test_usmlall_lane_x1_s8(uint32_t slice_base, svuint8_t zn, svint8_t zm) +{ + SVE_ACLE_FUNC(svusmla_lane_za32,_u8,_vg4x1,,)(slice_base, 12, zn, zm, 15); +} + +// +// Indexed x 2 +// + +// SMLAL + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_smlal_lane_x2_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.lane.vg4x2.nxv16i8(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 15) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_smlal_lane_x2_s8j10svint8x2_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.lane.vg4x2.nxv16i8(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 15) +// CPP-CHECK-NEXT: ret void +// +void test_smlal_lane_x2_s8(uint32_t slice_base, svint8x2_t zn, svint8_t zm) +{ + SVE_ACLE_FUNC(svmla_lane_za32,_s8,_vg4x2,,)(slice_base, 4, zn, zm, 15); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_smlal_lane_x2_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.lane.vg4x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_smlal_lane_x2_s16j11svint16x2_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call 
@llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.lane.vg4x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_smlal_lane_x2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) +{ + SVE_ACLE_FUNC(svmla_lane_za64,_s16,_vg4x2,,)(slice_base, 4, zn, zm, 7); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_smlal_lane_x2_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.lane.vg4x2.nxv16i8(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 15) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_smlal_lane_x2_u8j11svuint8x2_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.lane.vg4x2.nxv16i8(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 15) +// CPP-CHECK-NEXT: ret void +// +void test_smlal_lane_x2_u8(uint32_t slice_base, svuint8x2_t zn, svuint8_t zm) +{ + SVE_ACLE_FUNC(svmla_lane_za32,_u8,_vg4x2,,)(slice_base, 4, zn, zm, 15); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_smlal_lane_x2_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.lane.vg4x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_smlal_lane_x2_u16j12svuint16x2_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.lane.vg4x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_smlal_lane_x2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) +{ + SVE_ACLE_FUNC(svmla_lane_za64,_u16,_vg4x2,,)(slice_base, 4, zn, zm, 7); +} + +// SMLSL + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_smlsl_lane_x2_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.lane.vg4x2.nxv16i8(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 15) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_smlsl_lane_x2_s8j10svint8x2_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// 
CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.lane.vg4x2.nxv16i8(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 15) +// CPP-CHECK-NEXT: ret void +// +void test_smlsl_lane_x2_s8(uint32_t slice_base, svint8x2_t zn, svint8_t zm) +{ + SVE_ACLE_FUNC(svmls_lane_za32,_s8,_vg4x2,,)(slice_base, 4, zn, zm, 15); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_smlsl_lane_x2_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.lane.vg4x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_smlsl_lane_x2_s16j11svint16x2_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.lane.vg4x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_smlsl_lane_x2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) +{ + SVE_ACLE_FUNC(svmls_lane_za64,_s16,_vg4x2,,)(slice_base, 4, zn, zm, 7); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_smlsl_lane_x2_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.lane.vg4x2.nxv16i8(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 15) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_smlsl_lane_x2_u8j11svuint8x2_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.lane.vg4x2.nxv16i8(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 15) +// CPP-CHECK-NEXT: ret void +// +void test_smlsl_lane_x2_u8(uint32_t slice_base, svuint8x2_t zn, svuint8_t zm) +{ + SVE_ACLE_FUNC(svmls_lane_za32,_u8,_vg4x2,,)(slice_base, 4, zn, zm, 15); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_smlsl_lane_x2_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.lane.vg4x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 7) +// 
CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_smlsl_lane_x2_u16j12svuint16x2_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.lane.vg4x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_smlsl_lane_x2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) +{ + SVE_ACLE_FUNC(svmls_lane_za64,_u16,_vg4x2,,)(slice_base, 4, zn, zm, 7); +} + +// SUMLALL + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_sumlall_lane_x2_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sumla.za32.lane.vg4x2.nxv16i8(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 15) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_sumlall_lane_x2_s8j10svint8x2_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sumla.za32.lane.vg4x2.nxv16i8(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 15) +// CPP-CHECK-NEXT: ret void +// +void test_sumlall_lane_x2_s8(uint32_t slice_base, svint8x2_t zn, svuint8_t zm) +{ + SVE_ACLE_FUNC(svsumla_lane_za32,_s8,_vg4x2,,)(slice_base, 4, zn, zm, 15); +} +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_usmlall_lane_x2_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.lane.vg4x2.nxv16i8(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 15) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_usmlall_lane_x2_s8j11svuint8x2_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.lane.vg4x2.nxv16i8(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 15) +// CPP-CHECK-NEXT: ret void +// +void test_usmlall_lane_x2_s8(uint32_t slice_base, svuint8x2_t zn, svint8_t zm) +{ + SVE_ACLE_FUNC(svusmla_lane_za32,_u8,_vg4x2,,)(slice_base, 4, zn, zm, 15); +} + +// +// Indexed x 4 +// + +// MLAL + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_smlal_lane_x4_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) 
+// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.lane.vg4x4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 15) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_smlal_lane_x4_s8j10svint8x4_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.lane.vg4x4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 15) +// CPP-CHECK-NEXT: ret void +// +void test_smlal_lane_x4_s8(uint32_t slice_base, svint8x4_t zn, svint8_t zm) +{ + SVE_ACLE_FUNC(svmla_lane_za32,_s8,_vg4x4,,)(slice_base, 4, zn, zm, 15); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_smlal_lane_x4_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.lane.vg4x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_smlal_lane_x4_s16j11svint16x4_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.lane.vg4x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_smlal_lane_x4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) +{ + SVE_ACLE_FUNC(svmla_lane_za64,_s16,_vg4x4,,)(slice_base, 4, zn, zm, 7); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_smlal_lane_x4_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 
[[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.lane.vg4x4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 15) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_smlal_lane_x4_u8j11svuint8x4_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.lane.vg4x4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 15) +// CPP-CHECK-NEXT: ret void +// +void test_smlal_lane_x4_u8(uint32_t slice_base, svuint8x4_t zn, svuint8_t zm) +{ + SVE_ACLE_FUNC(svmla_lane_za32,_u8,_vg4x4,,)(slice_base, 4, zn, zm, 15); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_smlal_lane_x4_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.lane.vg4x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_smlal_lane_x4_u16j12svuint16x4_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.lane.vg4x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_smlal_lane_x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) +{ + SVE_ACLE_FUNC(svmla_lane_za64,_u16,_vg4x4,,)(slice_base, 4, zn, zm, 7); +} + +// MLSL + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_smlsl_lane_x4_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.lane.vg4x4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 15) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: 
@_Z21test_smlsl_lane_x4_s8j10svint8x4_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.lane.vg4x4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 15) +// CPP-CHECK-NEXT: ret void +// +void test_smlsl_lane_x4_s8(uint32_t slice_base, svint8x4_t zn, svint8_t zm) +{ + SVE_ACLE_FUNC(svmls_lane_za32,_s8,_vg4x4,,)(slice_base, 4, zn, zm, 15); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_smlsl_lane_x4_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.lane.vg4x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_smlsl_lane_x4_s16j11svint16x4_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.lane.vg4x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_smlsl_lane_x4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) +{ + SVE_ACLE_FUNC(svmls_lane_za64,_s16,_vg4x4,,)(slice_base, 4, zn, zm, 7); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_smlsl_lane_x4_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.lane.vg4x4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 15) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_smlsl_lane_x4_u8j11svuint8x4_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call 
@llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.lane.vg4x4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 15) +// CPP-CHECK-NEXT: ret void +// +void test_smlsl_lane_x4_u8(uint32_t slice_base, svuint8x4_t zn, svuint8_t zm) +{ + SVE_ACLE_FUNC(svmls_lane_za32,_u8,_vg4x4,,)(slice_base, 4, zn, zm, 15); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_smlsl_lane_x4_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.lane.vg4x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_smlsl_lane_x4_u16j12svuint16x4_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.lane.vg4x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_smlsl_lane_x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) +{ + SVE_ACLE_FUNC(svmls_lane_za64,_u16,_vg4x4,,)(slice_base, 4, zn, zm, 7); +} + +// SUMLALL + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_sumlall_lane_x4_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sumla.za32.lane.vg4x4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 15) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_sumlall_lane_x4_s8j10svint8x4_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call 
@llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sumla.za32.lane.vg4x4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 15) +// CPP-CHECK-NEXT: ret void +// +void test_sumlall_lane_x4_s8(uint32_t slice_base, svint8x4_t zn, svuint8_t zm) +{ + SVE_ACLE_FUNC(svsumla_lane_za32,_s8,_vg4x4,,)(slice_base, 4, zn, zm, 15); +} + +// USMLALL + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_usmlall_lane_x4_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.lane.vg4x4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 15) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_usmlall_lane_x4_s8j11svuint8x4_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.lane.vg4x4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 15) +// CPP-CHECK-NEXT: ret void +// +void test_usmlall_lane_x4_s8(uint32_t slice_base, svuint8x4_t zn, svint8_t zm) +{ + SVE_ACLE_FUNC(svusmla_lane_za32,_u8,_vg4x4,,)(slice_base, 4, zn, zm, 15); +} diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mls.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mls.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mls.c @@ -0,0 +1,334 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu 
-target-feature +sme2 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED,A5) A1##A3##A5 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4,A5) A1##A2##A3##A4##A5 +#endif + +// +// Multi, multi +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls2_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.vg1x2.nxv4f32(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svmls2_f32j13svfloat32x2_t13svfloat32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.vg1x2.nxv4f32(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls2_f32(uint32_t slice_base, svfloat32x2_t zn, svfloat32x2_t zm) { + SVE_ACLE_FUNC(svmls,,_za32,_f32,_vg1x2)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls4_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 12) +// CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.vg1x4.nxv4f32(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svmls4_f32j13svfloat32x4_t13svfloat32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call 
@llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 12) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.vg1x4.nxv4f32(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls4_f32(uint32_t slice_base, svfloat32x4_t zn, svfloat32x4_t zm) { + SVE_ACLE_FUNC(svmls,,_za32,_f32,_vg1x4)(slice_base, 7, zn, zm); +} + +// +// Multi, single +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls_single2_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.single.vg1x2.nxv4f32(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmls_single2_f32j13svfloat32x2_tu13__SVFloat32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.single.vg1x2.nxv4f32(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_single2_f32(uint32_t slice_base, svfloat32x2_t zn, svfloat32_t zm) { + SVE_ACLE_FUNC(svmls,_single,_za32,_f32,_vg1x2)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls_single4_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.single.vg1x4.nxv4f32(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmls_single4_f32j13svfloat32x4_tu13__SVFloat32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.single.vg1x4.nxv4f32(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], 
[[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_single4_f32(uint32_t slice_base, svfloat32x4_t zn, svfloat32_t zm) { + SVE_ACLE_FUNC(svmls,_single,_za32,_f32,_vg1x4)(slice_base, 7, zn, zm); +} + +// +// Multi, indexed +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls_lane2_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.lane.vg1x2.nxv4f32(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmls_lane2_f32j13svfloat32x2_tu13__SVFloat32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.lane.vg1x2.nxv4f32(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_lane2_f32(uint32_t slice_base, svfloat32x2_t zn, svfloat32_t zm) { + SVE_ACLE_FUNC(svmls_lane,,_za32,_f32,_vg1x2)(slice_base, 7, zn, zm, 3); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls_lane4_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.lane.vg1x4.nxv4f32(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmls_lane4_f32j13svfloat32x4_tu13__SVFloat32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.lane.vg1x4.nxv4f32(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_lane4_f32(uint32_t slice_base, svfloat32x4_t zn, svfloat32_t zm) { + SVE_ACLE_FUNC(svmls_lane,,_za32,_f32,_vg1x4)(slice_base, 7, zn, zm, 3); +} + +// +// Multi, multi +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls2_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = 
tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZM]], i64 2) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.vg1x2.nxv2f64(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svmls2_f64j13svfloat64x2_t13svfloat64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZM]], i64 2) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.vg1x2.nxv2f64(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls2_f64(uint32_t slice_base, svfloat64x2_t zn, svfloat64x2_t zm) { + SVE_ACLE_FUNC(svmls,,_za64,_f64,_vg1x2)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls4_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 2) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 6) +// CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.vg1x4.nxv2f64(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svmls4_f64j13svfloat64x4_t13svfloat64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 2) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 6) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.vg1x4.nxv2f64(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls4_f64(uint32_t slice_base, svfloat64x4_t zn, svfloat64x4_t zm) { + 
SVE_ACLE_FUNC(svmls,,_za64,_f64,_vg1x4)(slice_base, 7, zn, zm); +} + +// +// Multi, single +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls_single2_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.single.vg1x2.nxv2f64(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmls_single2_f64j13svfloat64x2_tu13__SVFloat64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.single.vg1x2.nxv2f64(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_single2_f64(uint32_t slice_base, svfloat64x2_t zn, svfloat64_t zm) { + SVE_ACLE_FUNC(svmls,_single,_za64,_f64,_vg1x2)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls_single4_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.single.vg1x4.nxv2f64(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmls_single4_f64j13svfloat64x4_tu13__SVFloat64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.single.vg1x4.nxv2f64(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_single4_f64(uint32_t slice_base, svfloat64x4_t zn, svfloat64_t zm) { + SVE_ACLE_FUNC(svmls,_single,_za64,_f64,_vg1x4)(slice_base, 7, zn, zm); +} + +// +// Multi, indexed +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls_lane2_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.lane.vg1x2.nxv2f64(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 1) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: 
@_Z20test_svmls_lane2_f64j13svfloat64x2_tu13__SVFloat64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.lane.vg1x2.nxv2f64(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 1) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_lane2_f64(uint32_t slice_base, svfloat64x2_t zn, svfloat64_t zm) { + SVE_ACLE_FUNC(svmls_lane,,_za64,_f64,_vg1x2)(slice_base, 7, zn, zm, 1); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls_lane4_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.lane.vg1x4.nxv2f64(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 1) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmls_lane4_f64j13svfloat64x4_tu13__SVFloat64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.lane.vg1x4.nxv2f64(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 1) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_lane4_f64(uint32_t slice_base, svfloat64x4_t zn, svfloat64_t zm) { + SVE_ACLE_FUNC(svmls_lane,,_za64,_f64,_vg1x4)(slice_base, 7, zn, zm, 1); +} diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlsl.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlsl.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlsl.c @@ -0,0 +1,794 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK 
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED,A5) A1##A3##A5 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4,A5) A1##A2##A3##A4##A5 +#endif + +// +// Multi, multi +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls2_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.vg2x2.nxv8f16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svmls2_f16j13svfloat16x2_t13svfloat16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.vg2x2.nxv8f16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls2_f16(uint32_t slice_base, svfloat16x2_t zn, svfloat16x2_t zm) +{ + SVE_ACLE_FUNC(svmls_za32,_f16,_vg2x2,,)(slice_base, 6, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls2_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.vg2x2.nxv8bf16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z16test_svmls2_bf16j14svbfloat16x2_t14svbfloat16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.vg2x2.nxv8bf16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls2_bf16(uint32_t slice_base, svbfloat16x2_t zn, svbfloat16x2_t zm) +{ + 
SVE_ACLE_FUNC(svmls_za32,_bf16,_vg2x2,,)(slice_base, 6, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls2_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.vg2x2.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svmls2_u16j12svuint16x2_t12svuint16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.vg2x2.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16x2_t zm) +{ + SVE_ACLE_FUNC(svmls_za32,_u16,_vg2x2,,)(slice_base, 6, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls2_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.vg2x2.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svmls2_s16j11svint16x2_t11svint16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.vg2x2.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls2_s16(uint32_t slice_base, svint16x2_t zn, svint16x2_t zm) +{ + SVE_ACLE_FUNC(svmls_za32,_s16,_vg2x2,,)(slice_base, 6, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls4_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 8) +// CHECK-NEXT: 
[[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 24) +// CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.vg2x4.nxv8f16(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svmls4_f16j13svfloat16x4_t13svfloat16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 24) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.vg2x4.nxv8f16(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls4_f16(uint32_t slice_base, svfloat16x4_t zn, svfloat16x4_t zm) +{ + SVE_ACLE_FUNC(svmls_za32,_f16,_vg2x4,,)(slice_base, 6, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls4_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZM]], i64 24) +// CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.vg2x4.nxv8bf16(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z16test_svmls4_bf16j14svbfloat16x4_t14svbfloat16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call 
@llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZM]], i64 24) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.vg2x4.nxv8bf16(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls4_bf16(uint32_t slice_base, svbfloat16x4_t zn, svbfloat16x4_t zm) +{ + SVE_ACLE_FUNC(svmls_za32,_bf16,_vg2x4,,)(slice_base, 6, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls4_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 24) +// CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.vg2x4.nxv8i16(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svmls4_u16j12svuint16x4_t12svuint16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 24) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.vg2x4.nxv8i16(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16x4_t zm) +{ + SVE_ACLE_FUNC(svmls_za32,_u16,_vg2x4,,)(slice_base, 6, zn, zm); +} + 
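+// Editor's note (illustrative only, not part of the autogenerated checks):
+// given the SVE_ACLE_FUNC definitions at the top of this file, the call in
+// the test above expands as follows:
+//   SVE_ACLE_FUNC(svmls_za32,_u16,_vg2x4,,)(slice_base, 6, zn, zm)
+//     default build:           svmls_za32_u16_vg2x4(slice_base, 6, zn, zm)
+//     -DSVE_OVERLOADED_FORMS:  svmls_za32_vg2x4(slice_base, 6, zn, zm)
+// i.e. the macro selects either the fully-suffixed or the overloaded ACLE
+// intrinsic name, so both spellings are exercised by the RUN lines.
+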
+__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls4_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 24) +// CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.vg2x4.nxv8i16(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svmls4_s16j11svint16x4_t11svint16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 24) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.vg2x4.nxv8i16(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls4_s16(uint32_t slice_base, svint16x4_t zn, svint16x4_t zm) +{ + SVE_ACLE_FUNC(svmls_za32,_s16,_vg2x4,,)(slice_base, 6, zn, zm); +} + +// +// Multi, single +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls_single1_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 14 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.single.vg2x1.nxv8f16(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmls_single1_f16ju13__SVFloat16_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 14 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.single.vg2x1.nxv8f16(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_single1_f16(uint32_t slice_base, svfloat16_t zn, svfloat16_t zm) +{ + SVE_ACLE_FUNC(svmls,_single,_za32,_f16,_vg2x1)(slice_base, 14, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls_single1_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 14 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.single.vg2x1.nxv8bf16(i32 [[TMP0]], 
[[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_svmls_single1_bf16ju14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 14 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.single.vg2x1.nxv8bf16(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_single1_bf16(uint32_t slice_base, svbfloat16_t zn, svbfloat16_t zm) +{ + SVE_ACLE_FUNC(svmls,_single,_za32,_bf16,_vg2x1)(slice_base, 14, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls_single1_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 14 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.single.vg2x1.nxv8i16(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmls_single1_u16ju12__SVUint16_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 14 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.single.vg2x1.nxv8i16(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_single1_u16(uint32_t slice_base, svuint16_t zn, svuint16_t zm) +{ + SVE_ACLE_FUNC(svmls,_single,_za32,_u16,_vg2x1)(slice_base, 14, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls_single1_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 14 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.single.vg2x1.nxv8i16(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmls_single1_s16ju11__SVInt16_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 14 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.single.vg2x1.nxv8i16(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_single1_s16(uint32_t slice_base, svint16_t zn, svint16_t zm) +{ + SVE_ACLE_FUNC(svmls,_single,_za32,_s16,_vg2x1)(slice_base, 14, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls_single2_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.single.vg2x2.nxv8f16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmls_single2_f16j13svfloat16x2_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.single.vg2x2.nxv8f16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_single2_f16(uint32_t slice_base, svfloat16x2_t zn, svfloat16_t zm) +{ + SVE_ACLE_FUNC(svmls,_single,_za32,_f16,_vg2x2)(slice_base, 6, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls_single2_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call 
@llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.single.vg2x2.nxv8bf16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_svmls_single2_bf16j14svbfloat16x2_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.single.vg2x2.nxv8bf16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_single2_bf16(uint32_t slice_base, svbfloat16x2_t zn, svbfloat16_t zm) +{ + SVE_ACLE_FUNC(svmls,_single,_za32,_bf16,_vg2x2)(slice_base, 6, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls_single2_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.single.vg2x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmls_single2_u16j12svuint16x2_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.single.vg2x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_single2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) +{ + SVE_ACLE_FUNC(svmls,_single,_za32,_u16,_vg2x2)(slice_base, 6, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls_single2_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.single.vg2x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmls_single2_s16j11svint16x2_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.single.vg2x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_single2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) +{ + SVE_ACLE_FUNC(svmls,_single,_za32,_s16,_vg2x2)(slice_base, 6, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) 
+// CHECK-LABEL: @test_svmls_single4_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.single.vg2x4.nxv8f16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmls_single4_f16j13svfloat16x4_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.single.vg2x4.nxv8f16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_single4_f16(uint32_t slice_base, svfloat16x4_t zn, svfloat16_t zm) +{ + SVE_ACLE_FUNC(svmls,_single,_za32,_f16,_vg2x4)(slice_base, 6, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls_single4_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.single.vg2x4.nxv8bf16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_svmls_single4_bf16j14svbfloat16x4_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.single.vg2x4.nxv8bf16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_single4_bf16(uint32_t slice_base, svbfloat16x4_t zn, svbfloat16_t zm) +{ + SVE_ACLE_FUNC(svmls,_single,_za32,_bf16,_vg2x4)(slice_base, 6, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls_single4_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call 
@llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.single.vg2x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmls_single4_u16j12svuint16x4_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.single.vg2x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_single4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) +{ + SVE_ACLE_FUNC(svmls,_single,_za32,_u16,_vg2x4)(slice_base, 6, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls_single4_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.single.vg2x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmls_single4_s16j11svint16x4_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.single.vg2x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_single4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) +{ + SVE_ACLE_FUNC(svmls,_single,_za32,_s16,_vg2x4)(slice_base, 6, zn, zm); +} + +// +// Multi, indexed +// + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls_lane1_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 14 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x1.nxv8f16(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmls_lane1_f16ju13__SVFloat16_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 
[[SLICE_BASE:%.*]], 14 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x1.nxv8f16(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_lane1_f16(uint32_t slice_base, svfloat16_t zn, svfloat16_t zm) +{ + SVE_ACLE_FUNC(svmls_lane,,_za32,_f16,_vg2x1)(slice_base, 14, zn, zm, 7); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls_lane1_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 14 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x1.nxv8bf16(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_svmls_lane1_bf16ju14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 14 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x1.nxv8bf16(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_lane1_bf16(uint32_t slice_base, svbfloat16_t zn, svbfloat16_t zm) +{ + SVE_ACLE_FUNC(svmls_lane,,_za32,_bf16,_vg2x1)(slice_base, 14, zn, zm, 7); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls_lane1_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 14 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.lane.vg2x1.nxv8i16(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmls_lane1_u16ju12__SVUint16_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 14 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.lane.vg2x1.nxv8i16(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_lane1_u16(uint32_t slice_base, svuint16_t zn, svuint16_t zm) +{ + SVE_ACLE_FUNC(svmls_lane,,_za32,_u16,_vg2x1)(slice_base, 14, zn, zm, 7); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls_lane1_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 14 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.lane.vg2x1.nxv8i16(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmls_lane1_s16ju11__SVInt16_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 14 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.lane.vg2x1.nxv8i16(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_lane1_s16(uint32_t slice_base, svint16_t zn, svint16_t zm) +{ + SVE_ACLE_FUNC(svmls_lane,,_za32,_s16,_vg2x1)(slice_base, 14, zn, zm, 7); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls_lane2_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x2.nxv8f16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmls_lane2_f16j13svfloat16x2_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call 
@llvm.vector.extract.nxv8f16.nxv16f16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x2.nxv8f16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_lane2_f16(uint32_t slice_base, svfloat16x2_t zn, svfloat16_t zm) +{ + SVE_ACLE_FUNC(svmls_lane,,_za32,_f16,_vg2x2)(slice_base, 6, zn, zm, 7); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls_lane2_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x2.nxv8bf16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_svmls_lane2_bf16j14svbfloat16x2_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x2.nxv8bf16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_lane2_bf16(uint32_t slice_base, svbfloat16x2_t zn, svbfloat16_t zm) +{ + SVE_ACLE_FUNC(svmls_lane,,_za32,_bf16,_vg2x2)(slice_base, 6, zn, zm, 7); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls_lane2_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.lane.vg2x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmls_lane2_u16j12svuint16x2_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.lane.vg2x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_lane2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) +{ + SVE_ACLE_FUNC(svmls_lane,,_za32,_u16,_vg2x2)(slice_base, 6, zn, zm, 7); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls_lane2_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.lane.vg2x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmls_lane2_s16j11svint16x2_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: 
[[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.lane.vg2x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_lane2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) +{ + SVE_ACLE_FUNC(svmls_lane,,_za32,_s16,_vg2x2)(slice_base, 6, zn, zm, 7); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls_lane4_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x4.nxv8f16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmls_lane4_f16j13svfloat16x4_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x4.nxv8f16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_lane4_f16(uint32_t slice_base, svfloat16x4_t zn, svfloat16_t zm) +{ + SVE_ACLE_FUNC(svmls_lane,,_za32,_f16,_vg2x4)(slice_base, 6, zn, zm, 7); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls_lane4_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x4.nxv8bf16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_svmls_lane4_bf16j14svbfloat16x4_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x4.nxv8bf16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_lane4_bf16(uint32_t slice_base, svbfloat16x4_t zn, svbfloat16_t zm) +{ + SVE_ACLE_FUNC(svmls_lane,,_za32,_bf16,_vg2x4)(slice_base, 6, zn, zm, 7); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls_lane4_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.lane.vg2x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmls_lane4_u16j12svuint16x4_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.lane.vg2x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_lane4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) +{ + SVE_ACLE_FUNC(svmls_lane,,_za32,_u16,_vg2x4)(slice_base, 6, zn, zm, 7); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls_lane4_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.lane.vg2x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmls_lane4_s16j11svint16x4_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.lane.vg2x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_lane4_s16(uint32_t slice_base, svint16x4_t zn, 
svint16_t zm)
+{
+  SVE_ACLE_FUNC(svmls_lane,,_za32,_s16,_vg2x4)(slice_base, 6, zn, zm, 7);
+}
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mop.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mop.c
new file mode 100644
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mop.c
@@ -0,0 +1,140 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+
+// REQUIRES: aarch64-registered-target
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+
+#include
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4
+#endif
+
+// MOPA
+
+__attribute__((arm_streaming, arm_shared_za))
+// CHECK-LABEL: @test_svmopa_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PN:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smopa.za32.nxv8i16(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z15test_svmopa_s16u10__SVBool_tu10__SVBool_tu11__SVInt16_tu11__SVInt16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PN:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smopa.za32.nxv8i16(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmopa_s16(svbool_t pn, svbool_t pm, svint16_t zn, svint16_t zm) {
+  SVE_ACLE_FUNC(svmopa_za32,_s16,_m,)(3, pn, pm, zn, zm);
+}
+
+__attribute__((arm_streaming, arm_shared_za))
+// CHECK-LABEL: @test_svmopa_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PN:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umopa.za32.nxv8i16(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z15test_svmopa_u16u10__SVBool_tu10__SVBool_tu12__SVUint16_tu12__SVUint16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PN:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call
@llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umopa.za32.nxv8i16(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmopa_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svuint16_t zm) {
+  SVE_ACLE_FUNC(svmopa_za32,_u16,_m,)(3, pn, pm, zn, zm);
+}
+
+// MOPS
+
+__attribute__((arm_streaming, arm_shared_za))
+// CHECK-LABEL: @test_svmops_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PN:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smops.za32.nxv8i16(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z15test_svmops_s16u10__SVBool_tu10__SVBool_tu11__SVInt16_tu11__SVInt16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PN:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smops.za32.nxv8i16(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmops_s16(svbool_t pn, svbool_t pm, svint16_t zn, svint16_t zm) {
+  SVE_ACLE_FUNC(svmops_za32,_s16,_m,)(3, pn, pm, zn, zm);
+}
+
+__attribute__((arm_streaming, arm_shared_za))
+// CHECK-LABEL: @test_svmops_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PN:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umops.za32.nxv8i16(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z15test_svmops_u16u10__SVBool_tu10__SVBool_tu12__SVUint16_tu12__SVUint16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PN:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umops.za32.nxv8i16(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmops_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svuint16_t zm) {
+  SVE_ACLE_FUNC(svmops_za32,_u16,_m,)(3, pn, pm, zn, zm);
+}
+
+// BMOPA
+
+__attribute__((arm_streaming, arm_shared_za))
+// CHECK-LABEL: @test_svbmopa_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PN:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.bmopa.za32.nxv4i32(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z16test_svbmopa_u16u10__SVBool_tu10__SVBool_tu12__SVUint32_tu12__SVUint32_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PN:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.bmopa.za32.nxv4i32(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svbmopa_u16(svbool_t pn, svbool_t pm, svuint32_t zn,
svuint32_t zm) {
+  SVE_ACLE_FUNC(svbmopa_za32,_u32,_m,)(3, pn, pm, zn, zm);
+}
+
+// BMOPS
+
+__attribute__((arm_streaming, arm_shared_za))
+// CHECK-LABEL: @test_svbmops_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PN:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PM:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.bmops.za32.nxv4i32(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z16test_svbmops_u16u10__SVBool_tu10__SVBool_tu12__SVUint32_tu12__SVUint32_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PN:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PM:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.bmops.za32.nxv4i32(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svbmops_u16(svbool_t pn, svbool_t pm, svuint32_t zn, svuint32_t zm) {
+  SVE_ACLE_FUNC(svbmops_za32,_u32,_m,)(3, pn, pm, zn, zm);
+}
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_read.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_read.c
new file mode 100644
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_read.c
@@ -0,0 +1,1547 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: aarch64-registered-target
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+#include
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 +#endif + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za8_u8_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 14 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.ver.vg2.nxv16i8(i32 0, i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP3]], [[TMP4]], i64 16) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z26test_svread_ver_za8_u8_vg2j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 14 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.ver.vg2.nxv16i8(i32 0, i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP3]], [[TMP4]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svuint8x2_t test_svread_ver_za8_u8_vg2(uint32_t base) { + return svread_ver_za8_u8_vg2(0, base, 14); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za8_s8_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 14 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.ver.vg2.nxv16i8(i32 0, i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP3]], [[TMP4]], i64 16) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z26test_svread_ver_za8_s8_vg2j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 14 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.ver.vg2.nxv16i8(i32 0, i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP3]], [[TMP4]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svint8x2_t test_svread_ver_za8_s8_vg2(uint32_t base) { + return svread_ver_za8_s8_vg2(0, base, 14); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za8_u8_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 14 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.hor.vg2.nxv16i8(i32 0, i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP3]], [[TMP4]], i64 16) +// CHECK-NEXT: ret [[TMP5]] +// +// 
CPP-CHECK-LABEL: @_Z26test_svread_hor_za8_u8_vg2j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 14 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.hor.vg2.nxv16i8(i32 0, i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP3]], [[TMP4]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svuint8x2_t test_svread_hor_za8_u8_vg2(uint32_t base) { + return svread_hor_za8_u8_vg2(0, base, 14); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za8_s8_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 14 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.hor.vg2.nxv16i8(i32 0, i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP3]], [[TMP4]], i64 16) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z26test_svread_hor_za8_s8_vg2j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 14 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.hor.vg2.nxv16i8(i32 0, i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP3]], [[TMP4]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svint8x2_t test_svread_hor_za8_s8_vg2(uint32_t base) { + return svread_hor_za8_s8_vg2(0, base, 14); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za8_u8_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 12 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv16i8(i32 0, i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP3]], [[TMP4]], i64 16) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP5]], [[TMP6]], i64 32) +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP7]], [[TMP8]], i64 48) +// CHECK-NEXT: ret [[TMP9]] +// +// CPP-CHECK-LABEL: @_Z26test_svread_hor_za8_u8_vg4j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 12 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv16i8(i32 0, i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, 
[[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP3]], [[TMP4]], i64 16) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP5]], [[TMP6]], i64 32) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP7]], [[TMP8]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP9]] +// +svuint8x4_t test_svread_hor_za8_u8_vg4(uint32_t base) { + return svread_hor_za8_u8_vg4(0, base, 12); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za8_s8_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 12 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv16i8(i32 0, i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP3]], [[TMP4]], i64 16) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP5]], [[TMP6]], i64 32) +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP7]], [[TMP8]], i64 48) +// CHECK-NEXT: ret [[TMP9]] +// +// CPP-CHECK-LABEL: @_Z26test_svread_hor_za8_s8_vg4j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 12 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv16i8(i32 0, i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP3]], [[TMP4]], i64 16) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP5]], [[TMP6]], i64 32) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP7]], [[TMP8]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP9]] +// +svint8x4_t test_svread_hor_za8_s8_vg4(uint32_t base) { + return svread_hor_za8_s8_vg4(0, base, 12); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za8_u8_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 12 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv16i8(i32 0, i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP3]], [[TMP4]], i64 16) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call 
@llvm.vector.insert.nxv64i8.nxv16i8( [[TMP5]], [[TMP6]], i64 32) +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP7]], [[TMP8]], i64 48) +// CHECK-NEXT: ret [[TMP9]] +// +// CPP-CHECK-LABEL: @_Z26test_svread_ver_za8_u8_vg4j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 12 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv16i8(i32 0, i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP3]], [[TMP4]], i64 16) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP5]], [[TMP6]], i64 32) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP7]], [[TMP8]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP9]] +// +svuint8x4_t test_svread_ver_za8_u8_vg4(uint32_t base) { + return svread_ver_za8_u8_vg4(0, base, 12); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za8_s8_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 12 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv16i8(i32 0, i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP3]], [[TMP4]], i64 16) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP5]], [[TMP6]], i64 32) +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP7]], [[TMP8]], i64 48) +// CHECK-NEXT: ret [[TMP9]] +// +// CPP-CHECK-LABEL: @_Z26test_svread_ver_za8_s8_vg4j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 12 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv16i8(i32 0, i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP3]], [[TMP4]], i64 16) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP5]], [[TMP6]], i64 32) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP7]], [[TMP8]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP9]] +// +svint8x4_t test_svread_ver_za8_s8_vg4(uint32_t base) { + return svread_ver_za8_s8_vg4(0, base, 12); +} + +__attribute__((arm_streaming, arm_shared_za)) 
+// CHECK-LABEL: @test_svread_hor_za16_u16_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 6 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.hor.vg2.nxv8i16(i32 1, i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP3]], [[TMP4]], i64 8) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_hor_za16_u16_vg2j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 6 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.hor.vg2.nxv8i16(i32 1, i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP3]], [[TMP4]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svuint16x2_t test_svread_hor_za16_u16_vg2(uint32_t base) { + return svread_hor_za16_u16_vg2(1, base, 6); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za16_bf16_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 6 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.hor.vg2.nxv8bf16(i32 1, i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( [[TMP3]], [[TMP4]], i64 8) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z29test_svread_hor_za16_bf16_vg2j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 6 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.hor.vg2.nxv8bf16(i32 1, i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( [[TMP3]], [[TMP4]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svbfloat16x2_t test_svread_hor_za16_bf16_vg2(uint32_t base) { + return svread_hor_za16_bf16_vg2(1, base, 6); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za16_f16_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 6 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.hor.vg2.nxv8f16(i32 1, i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP3]], [[TMP4]], i64 8) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_hor_za16_f16_vg2j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 
6 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.hor.vg2.nxv8f16(i32 1, i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP3]], [[TMP4]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svfloat16x2_t test_svread_hor_za16_f16_vg2(uint32_t base) { + return svread_hor_za16_f16_vg2(1, base, 6); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za16_s16_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 6 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.hor.vg2.nxv8i16(i32 1, i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP3]], [[TMP4]], i64 8) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_hor_za16_s16_vg2j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 6 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.hor.vg2.nxv8i16(i32 1, i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP3]], [[TMP4]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svint16x2_t test_svread_hor_za16_s16_vg2(uint32_t base) { + return svread_hor_za16_s16_vg2(1, base, 6); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za16_u16_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 6 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.ver.vg2.nxv8i16(i32 1, i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP3]], [[TMP4]], i64 8) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_ver_za16_u16_vg2j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 6 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.ver.vg2.nxv8i16(i32 1, i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP3]], [[TMP4]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svuint16x2_t test_svread_ver_za16_u16_vg2(uint32_t base) { + return svread_ver_za16_u16_vg2(1, base, 6); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za16_bf16_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: 
[[TMP0:%.*]] = add i32 [[BASE:%.*]], 6 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.ver.vg2.nxv8bf16(i32 1, i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( [[TMP3]], [[TMP4]], i64 8) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z29test_svread_ver_za16_bf16_vg2j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 6 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.ver.vg2.nxv8bf16(i32 1, i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( [[TMP3]], [[TMP4]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svbfloat16x2_t test_svread_ver_za16_bf16_vg2(uint32_t base) { + return svread_ver_za16_bf16_vg2(1, base, 6); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za16_f16_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 6 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.ver.vg2.nxv8f16(i32 1, i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP3]], [[TMP4]], i64 8) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_ver_za16_f16_vg2j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 6 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.ver.vg2.nxv8f16(i32 1, i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP3]], [[TMP4]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svfloat16x2_t test_svread_ver_za16_f16_vg2(uint32_t base) { + return svread_ver_za16_f16_vg2(1, base, 6); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za16_s16_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 6 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.ver.vg2.nxv8i16(i32 1, i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP3]], [[TMP4]], i64 8) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_ver_za16_s16_vg2j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 6 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } 
@llvm.aarch64.sme.read.ver.vg2.nxv8i16(i32 1, i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP3]], [[TMP4]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svint16x2_t test_svread_ver_za16_s16_vg2(uint32_t base) { + return svread_ver_za16_s16_vg2(1, base, 6); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za16_u16_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 4 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv8i16(i32 1, i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP3]], [[TMP4]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP5]], [[TMP6]], i64 16) +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP7]], [[TMP8]], i64 24) +// CHECK-NEXT: ret [[TMP9]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_hor_za16_u16_vg4j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 4 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv8i16(i32 1, i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP3]], [[TMP4]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP5]], [[TMP6]], i64 16) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP7]], [[TMP8]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP9]] +// +svuint16x4_t test_svread_hor_za16_u16_vg4(uint32_t base) { + return svread_hor_za16_u16_vg4(1, base, 4); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za16_bf16_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 4 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv8bf16(i32 1, i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP3]], [[TMP4]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP5]], [[TMP6]], i64 16) +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } 
[[TMP1]], 3 +// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP7]], [[TMP8]], i64 24) +// CHECK-NEXT: ret [[TMP9]] +// +// CPP-CHECK-LABEL: @_Z29test_svread_hor_za16_bf16_vg4j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 4 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv8bf16(i32 1, i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP3]], [[TMP4]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP5]], [[TMP6]], i64 16) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP7]], [[TMP8]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP9]] +// +svbfloat16x4_t test_svread_hor_za16_bf16_vg4(uint32_t base) { + return svread_hor_za16_bf16_vg4(1, base, 4); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za16_f16_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 4 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv8f16(i32 1, i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP3]], [[TMP4]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP5]], [[TMP6]], i64 16) +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP7]], [[TMP8]], i64 24) +// CHECK-NEXT: ret [[TMP9]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_hor_za16_f16_vg4j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 4 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv8f16(i32 1, i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP3]], [[TMP4]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP5]], [[TMP6]], i64 16) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP7]], [[TMP8]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP9]] +// +svfloat16x4_t test_svread_hor_za16_f16_vg4(uint32_t base) { + return svread_hor_za16_f16_vg4(1, base, 4); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za16_s16_vg4( +// CHECK-NEXT: entry: +// 
CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 4 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv8i16(i32 1, i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP3]], [[TMP4]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP5]], [[TMP6]], i64 16) +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP7]], [[TMP8]], i64 24) +// CHECK-NEXT: ret [[TMP9]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_hor_za16_s16_vg4j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 4 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv8i16(i32 1, i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP3]], [[TMP4]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP5]], [[TMP6]], i64 16) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP7]], [[TMP8]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP9]] +// +svint16x4_t test_svread_hor_za16_s16_vg4(uint32_t base) { + return svread_hor_za16_s16_vg4(1, base, 4); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za16_u16_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 4 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv8i16(i32 1, i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP3]], [[TMP4]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP5]], [[TMP6]], i64 16) +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP7]], [[TMP8]], i64 24) +// CHECK-NEXT: ret [[TMP9]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_ver_za16_u16_vg4j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 4 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv8i16(i32 1, i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = 
tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP3]], [[TMP4]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP5]], [[TMP6]], i64 16) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP7]], [[TMP8]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP9]] +// +svuint16x4_t test_svread_ver_za16_u16_vg4(uint32_t base) { + return svread_ver_za16_u16_vg4(1, base, 4); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za16_bf16_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 4 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv8bf16(i32 1, i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP3]], [[TMP4]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP5]], [[TMP6]], i64 16) +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP7]], [[TMP8]], i64 24) +// CHECK-NEXT: ret [[TMP9]] +// +// CPP-CHECK-LABEL: @_Z29test_svread_ver_za16_bf16_vg4j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 4 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv8bf16(i32 1, i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP3]], [[TMP4]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP5]], [[TMP6]], i64 16) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP7]], [[TMP8]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP9]] +// +svbfloat16x4_t test_svread_ver_za16_bf16_vg4(uint32_t base) { + return svread_ver_za16_bf16_vg4(1, base, 4); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za16_f16_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 4 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv8f16(i32 1, i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP3]], [[TMP4]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP5]], [[TMP6]], i64 16) +// CHECK-NEXT: 
[[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP7]], [[TMP8]], i64 24) +// CHECK-NEXT: ret [[TMP9]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_ver_za16_f16_vg4j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 4 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv8f16(i32 1, i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP3]], [[TMP4]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP5]], [[TMP6]], i64 16) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP7]], [[TMP8]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP9]] +// +svfloat16x4_t test_svread_ver_za16_f16_vg4(uint32_t base) { + return svread_ver_za16_f16_vg4(1, base, 4); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za16_s16_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 4 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv8i16(i32 1, i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP3]], [[TMP4]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP5]], [[TMP6]], i64 16) +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP7]], [[TMP8]], i64 24) +// CHECK-NEXT: ret [[TMP9]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_ver_za16_s16_vg4j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 4 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv8i16(i32 1, i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP3]], [[TMP4]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP5]], [[TMP6]], i64 16) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP7]], [[TMP8]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP9]] +// +svint16x4_t test_svread_ver_za16_s16_vg4(uint32_t base) { + return svread_ver_za16_s16_vg4(1, base, 4); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za32_u32_vg2( +// 
CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 2 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.hor.vg2.nxv4i32(i32 3, i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP3]], [[TMP4]], i64 4) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_hor_za32_u32_vg2j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 2 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.hor.vg2.nxv4i32(i32 3, i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP3]], [[TMP4]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svuint32x2_t test_svread_hor_za32_u32_vg2(uint32_t base) { + return svread_hor_za32_u32_vg2(3, base, 2); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za32_f32_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 2 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.hor.vg2.nxv4f32(i32 3, i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP3]], [[TMP4]], i64 4) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_hor_za32_f32_vg2j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 2 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.hor.vg2.nxv4f32(i32 3, i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP3]], [[TMP4]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svfloat32x2_t test_svread_hor_za32_f32_vg2(uint32_t base) { + return svread_hor_za32_f32_vg2(3, base, 2); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za32_s32_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 2 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.hor.vg2.nxv4i32(i32 3, i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP3]], [[TMP4]], i64 4) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_hor_za32_s32_vg2j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 2 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } 
@llvm.aarch64.sme.read.hor.vg2.nxv4i32(i32 3, i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP3]], [[TMP4]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svint32x2_t test_svread_hor_za32_s32_vg2(uint32_t base) { + return svread_hor_za32_s32_vg2(3, base, 2); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za32_u32_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 2 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.ver.vg2.nxv4i32(i32 3, i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP3]], [[TMP4]], i64 4) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_ver_za32_u32_vg2j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 2 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.ver.vg2.nxv4i32(i32 3, i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP3]], [[TMP4]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svuint32x2_t test_svread_ver_za32_u32_vg2(uint32_t base) { + return svread_ver_za32_u32_vg2(3, base, 2); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za32_f32_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 2 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.ver.vg2.nxv4f32(i32 3, i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP3]], [[TMP4]], i64 4) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_ver_za32_f32_vg2j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 2 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.ver.vg2.nxv4f32(i32 3, i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP3]], [[TMP4]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svfloat32x2_t test_svread_ver_za32_f32_vg2(uint32_t base) { + return svread_ver_za32_f32_vg2(3, base, 2); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za32_s32_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 2 +// CHECK-NEXT: [[TMP1:%.*]] = tail 
call { , } @llvm.aarch64.sme.read.ver.vg2.nxv4i32(i32 3, i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP3]], [[TMP4]], i64 4) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_ver_za32_s32_vg2j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 2 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.ver.vg2.nxv4i32(i32 3, i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP3]], [[TMP4]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svint32x2_t test_svread_ver_za32_s32_vg2(uint32_t base) { + return svread_ver_za32_s32_vg2(3, base, 2); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za32_u32_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv4i32(i32 3, i32 [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_hor_za32_u32_vg4j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv4i32(i32 3, i32 [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint32x4_t test_svread_hor_za32_u32_vg4(uint32_t base) { + return svread_hor_za32_u32_vg4(3, base, 0); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za32_f32_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv4f32(i32 3, i32 [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call 
@llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_hor_za32_f32_vg4j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv4f32(i32 3, i32 [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat32x4_t test_svread_hor_za32_f32_vg4(uint32_t base) { + return svread_hor_za32_f32_vg4(3, base, 0); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za32_s32_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv4i32(i32 3, i32 [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_hor_za32_s32_vg4j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv4i32(i32 3, i32 [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 
12) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint32x4_t test_svread_hor_za32_s32_vg4(uint32_t base) { + return svread_hor_za32_s32_vg4(3, base, 0); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za32_u32_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv4i32(i32 3, i32 [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_ver_za32_u32_vg4j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv4i32(i32 3, i32 [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint32x4_t test_svread_ver_za32_u32_vg4(uint32_t base) { + return svread_ver_za32_u32_vg4(3, base, 0); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za32_f32_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv4f32(i32 3, i32 [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_ver_za32_f32_vg4j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv4f32(i32 3, i32 [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = 
extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat32x4_t test_svread_ver_za32_f32_vg4(uint32_t base) { + return svread_ver_za32_f32_vg4(3, base, 0); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za32_s32_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv4i32(i32 3, i32 [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_ver_za32_s32_vg4j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv4i32(i32 3, i32 [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint32x4_t test_svread_ver_za32_s32_vg4(uint32_t base) { + return svread_ver_za32_s32_vg4(3, base, 0); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za64_u64_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.read.hor.vg2.nxv2i64(i32 7, i32 [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_hor_za64_u64_vg2j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.read.hor.vg2.nxv2i64(i32 7, i32 [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// 
CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint64x2_t test_svread_hor_za64_u64_vg2(uint32_t base) { + return svread_hor_za64_u64_vg2(7, base, 0); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za64_f64_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.read.hor.vg2.nxv2f64(i32 7, i32 [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_hor_za64_f64_vg2j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.read.hor.vg2.nxv2f64(i32 7, i32 [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svfloat64x2_t test_svread_hor_za64_f64_vg2(uint32_t base) { + return svread_hor_za64_f64_vg2(7, base, 0); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za64_s64_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.read.hor.vg2.nxv2i64(i32 7, i32 [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_hor_za64_s64_vg2j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.read.hor.vg2.nxv2i64(i32 7, i32 [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svint64x2_t test_svread_hor_za64_s64_vg2(uint32_t base) { + return svread_hor_za64_s64_vg2(7, base, 0); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za64_u64_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.read.ver.vg2.nxv2i64(i32 7, i32 [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], 
[[TMP3]], i64 2) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_ver_za64_u64_vg2j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.read.ver.vg2.nxv2i64(i32 7, i32 [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint64x2_t test_svread_ver_za64_u64_vg2(uint32_t base) { + return svread_ver_za64_u64_vg2(7, base, 0); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za64_f64_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.read.ver.vg2.nxv2f64(i32 7, i32 [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_ver_za64_f64_vg2j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.read.ver.vg2.nxv2f64(i32 7, i32 [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svfloat64x2_t test_svread_ver_za64_f64_vg2(uint32_t base) { + return svread_ver_za64_f64_vg2(7, base, 0); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za64_s64_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.read.ver.vg2.nxv2i64(i32 7, i32 [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_ver_za64_s64_vg2j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.read.ver.vg2.nxv2i64(i32 7, i32 [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svint64x2_t test_svread_ver_za64_s64_vg2(uint32_t base) { + return svread_ver_za64_s64_vg2(7, base, 0); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za64_u64_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv2i64(i32 7, i32 
[[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_hor_za64_u64_vg4j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv2i64(i32 7, i32 [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint64x4_t test_svread_hor_za64_u64_vg4(uint32_t base) { + return svread_hor_za64_u64_vg4(7, base, 0); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za64_f64_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv2f64(i32 7, i32 [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 6) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_hor_za64_f64_vg4j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv2f64(i32 7, i32 [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// 
CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat64x4_t test_svread_hor_za64_f64_vg4(uint32_t base) { + return svread_hor_za64_f64_vg4(7, base, 0); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za64_s64_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv2i64(i32 7, i32 [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_hor_za64_s64_vg4j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv2i64(i32 7, i32 [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint64x4_t test_svread_hor_za64_s64_vg4(uint32_t base) { + return svread_hor_za64_s64_vg4(7, base, 0); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za64_u64_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv2i64(i32 7, i32 [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_ver_za64_u64_vg4j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv2i64(i32 7, i32 [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call 
@llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint64x4_t test_svread_ver_za64_u64_vg4(uint32_t base) { + return svread_ver_za64_u64_vg4(7, base, 0); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za64_f64_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv2f64(i32 7, i32 [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 6) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_ver_za64_f64_vg4j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv2f64(i32 7, i32 [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat64x4_t test_svread_ver_za64_f64_vg4(uint32_t base) { + return svread_ver_za64_f64_vg4(7, base, 0); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za64_s64_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv2i64(i32 7, i32 [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , 
, } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_ver_za64_s64_vg4j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv2i64(i32 7, i32 [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint64x4_t test_svread_ver_za64_s64_vg4(uint32_t base) { + return svread_ver_za64_s64_vg4(7, base, 0); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_za64_u64_vg1x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 7 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.vg1x2.nxv2i64(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP3]], [[TMP4]], i64 2) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z26test_svread_za64_u64_vg1x2j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 7 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.vg1x2.nxv2i64(i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP3]], [[TMP4]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svuint64x2_t test_svread_za64_u64_vg1x2(uint32_t base) { + return svread_za64_u64_vg1x2(base, 7); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_za64_f64_vg1x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 7 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.vg1x2.nxv2f64(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP3]], [[TMP4]], i64 2) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z26test_svread_za64_f64_vg1x2j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 7 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.vg1x2.nxv2f64(i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: 
[[TMP3:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP3]], [[TMP4]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svfloat64x2_t test_svread_za64_f64_vg1x2(uint32_t base) { + return svread_za64_f64_vg1x2(base, 7); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_za64_s64_vg1x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 7 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.vg1x2.nxv2i64(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP3]], [[TMP4]], i64 2) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z26test_svread_za64_s64_vg1x2j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 7 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.vg1x2.nxv2i64(i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP3]], [[TMP4]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svint64x2_t test_svread_za64_s64_vg1x2(uint32_t base) { + return svread_za64_s64_vg1x2(base, 7); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_za64_u64_vg1x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 7 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.vg1x4.nxv2i64(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP3]], [[TMP4]], i64 2) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP5]], [[TMP6]], i64 4) +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP7]], [[TMP8]], i64 6) +// CHECK-NEXT: ret [[TMP9]] +// +// CPP-CHECK-LABEL: @_Z26test_svread_za64_u64_vg1x4j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 7 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.vg1x4.nxv2i64(i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP3]], [[TMP4]], i64 2) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP5]], [[TMP6]], i64 4) +// 
CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP7]], [[TMP8]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP9]] +// +svuint64x4_t test_svread_za64_u64_vg1x4(uint32_t base) { + return svread_za64_u64_vg1x4(base, 7); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_za64_f64_vg1x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 7 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.vg1x4.nxv2f64(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP3]], [[TMP4]], i64 2) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP5]], [[TMP6]], i64 4) +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP7]], [[TMP8]], i64 6) +// CHECK-NEXT: ret [[TMP9]] +// +// CPP-CHECK-LABEL: @_Z26test_svread_za64_f64_vg1x4j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 7 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.vg1x4.nxv2f64(i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP3]], [[TMP4]], i64 2) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP5]], [[TMP6]], i64 4) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP7]], [[TMP8]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP9]] +// +svfloat64x4_t test_svread_za64_f64_vg1x4(uint32_t base) { + return svread_za64_f64_vg1x4(base, 7); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_za64_s64_vg1x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 7 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.vg1x4.nxv2i64(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP3]], [[TMP4]], i64 2) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP5]], [[TMP6]], i64 4) +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP7]], [[TMP8]], i64 6) +// CHECK-NEXT: ret [[TMP9]] +// +// CPP-CHECK-LABEL: @_Z26test_svread_za64_s64_vg1x4j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 7 +// 
CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.vg1x4.nxv2i64(i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP3]], [[TMP4]], i64 2) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP5]], [[TMP6]], i64 4) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP7]], [[TMP8]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP9]] +// +svint64x4_t test_svread_za64_s64_vg1x4(uint32_t base) { + return svread_za64_s64_vg1x4(base, 7); +} + diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_reinterpret_svcount_svbool.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_reinterpret_svcount_svbool.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_reinterpret_svcount_svbool.c @@ -0,0 +1,47 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py + +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK + +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... 
macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 +#endif + +// CHECK-LABEL: @test_svreinterpret_svbool_svcnt( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.taarch64.svcountt(target("aarch64.svcount") [[CNT:%.*]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z31test_svreinterpret_svbool_svcntu11__SVCount_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.taarch64.svcountt(target("aarch64.svcount") [[CNT:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svbool_t test_svreinterpret_svbool_svcnt(svcount_t cnt) { + return SVE_ACLE_FUNC(svreinterpret,_b,,)(cnt); } + +// CHECK-LABEL: @test_svreinterpret_svcnt_svbool( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.convert.from.svbool.taarch64.svcountt( [[PG:%.*]]) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z31test_svreinterpret_svcnt_svboolu10__SVBool_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.convert.from.svbool.taarch64.svcountt( [[PG:%.*]]) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svreinterpret_svcnt_svbool(svbool_t pg) { + return SVE_ACLE_FUNC(svreinterpret,_c,,)(pg); } diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_rshl.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_rshl.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_rshl.c @@ -0,0 +1,1176 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED,A5) A1##A3##A5 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4,A5) A1##A2##A3##A4##A5 +#endif + +// Single, x2 + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svrshl_single_s8_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.srshl.single.x2.nxv16i8( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP4]], [[TMP5]], i64 16) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z24test_svrshl_single_s8_x210svint8x2_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.srshl.single.x2.nxv16i8( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP4]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svint8x2_t test_svrshl_single_s8_x2(svint8x2_t zdn, svint8_t zm) { + return SVE_ACLE_FUNC(svrshl,_single_s8_x2,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svrshl_single_s16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.srshl.single.x2.nxv8i16( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z25test_svrshl_single_s16_x211svint16x2_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.srshl.single.x2.nxv8i16( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svint16x2_t test_svrshl_single_s16_x2(svint16x2_t zdn, 
svint16_t zm) { + return SVE_ACLE_FUNC(svrshl,_single_s16_x2,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svrshl_single_s32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.srshl.single.x2.nxv4i32( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z25test_svrshl_single_s32_x211svint32x2_tu11__SVInt32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.srshl.single.x2.nxv4i32( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svint32x2_t test_svrshl_single_s32_x2(svint32x2_t zdn, svint32_t zm) { + return SVE_ACLE_FUNC(svrshl,_single_s32_x2,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svrshl_single_s64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.srshl.single.x2.nxv2i64( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP4]], [[TMP5]], i64 2) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z25test_svrshl_single_s64_x211svint64x2_tu11__SVInt64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.srshl.single.x2.nxv2i64( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP4]], [[TMP5]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svint64x2_t test_svrshl_single_s64_x2(svint64x2_t zdn, svint64_t zm) { + return SVE_ACLE_FUNC(svrshl,_single_s64_x2,,,)(zdn, zm); +} + 
+__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svrshl_single_u8_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.urshl.single.x2.nxv16i8( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP4]], [[TMP5]], i64 16) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z24test_svrshl_single_u8_x211svuint8x2_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.urshl.single.x2.nxv16i8( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP4]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svuint8x2_t test_svrshl_single_u8_x2(svuint8x2_t zdn, svuint8_t zm) { + return SVE_ACLE_FUNC(svrshl,_single_u8_x2,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svrshl_single_u16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.urshl.single.x2.nxv8i16( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z25test_svrshl_single_u16_x212svuint16x2_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.urshl.single.x2.nxv8i16( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svuint16x2_t test_svrshl_single_u16_x2(svuint16x2_t zdn, svuint16_t zm) { + return SVE_ACLE_FUNC(svrshl,_single_u16_x2,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: 
@test_svrshl_single_u32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.urshl.single.x2.nxv4i32( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z25test_svrshl_single_u32_x212svuint32x2_tu12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.urshl.single.x2.nxv4i32( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svuint32x2_t test_svrshl_single_u32_x2(svuint32x2_t zdn, svuint32_t zm) { + return SVE_ACLE_FUNC(svrshl,_single_u32_x2,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svrshl_single_u64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.urshl.single.x2.nxv2i64( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP4]], [[TMP5]], i64 2) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z25test_svrshl_single_u64_x212svuint64x2_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.urshl.single.x2.nxv2i64( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP4]], [[TMP5]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svuint64x2_t test_svrshl_single_u64_x2(svuint64x2_t zdn, svuint64_t zm) { + return SVE_ACLE_FUNC(svrshl,_single_u64_x2,,,)(zdn, zm); +} + +// Single, x4 + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svrshl_single_s8_x4( +// CHECK-NEXT: entry: +// 
CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.srshl.single.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP8]], [[TMP9]], i64 32) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 48) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z24test_svrshl_single_s8_x410svint8x4_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.srshl.single.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP8]], [[TMP9]], i64 32) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svint8x4_t test_svrshl_single_s8_x4(svint8x4_t zdn, svint8_t zm) { + return SVE_ACLE_FUNC(svrshl,_single_s8_x4,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svrshl_single_s16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.srshl.single.x4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call 
@llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 8) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP8]], [[TMP9]], i64 16) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 24) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z25test_svrshl_single_s16_x411svint16x4_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.srshl.single.x4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 8) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP8]], [[TMP9]], i64 16) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svint16x4_t test_svrshl_single_s16_x4(svint16x4_t zdn, svint16_t zm) { + return SVE_ACLE_FUNC(svrshl,_single_s16_x4,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svrshl_single_s32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.srshl.single.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP8]], [[TMP9]], i64 8) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 12) +// CHECK-NEXT: ret [[TMP12]] +// +// 
CPP-CHECK-LABEL: @_Z25test_svrshl_single_s32_x411svint32x4_tu11__SVInt32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.srshl.single.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP8]], [[TMP9]], i64 8) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svint32x4_t test_svrshl_single_s32_x4(svint32x4_t zdn, svint32_t zm) { + return SVE_ACLE_FUNC(svrshl,_single_s32_x4,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svrshl_single_s64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.srshl.single.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 2) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP8]], [[TMP9]], i64 4) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP10]], [[TMP11]], i64 6) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z25test_svrshl_single_s64_x411svint64x4_tu11__SVInt64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.srshl.single.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], 
[[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 2) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP8]], [[TMP9]], i64 4) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP10]], [[TMP11]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svint64x4_t test_svrshl_single_s64_x4(svint64x4_t zdn, svint64_t zm) { + return SVE_ACLE_FUNC(svrshl,_single_s64_x4,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svrshl_single_u8_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.urshl.single.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP8]], [[TMP9]], i64 32) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 48) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z24test_svrshl_single_u8_x411svuint8x4_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.urshl.single.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP8]], [[TMP9]], i64 32) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: 
[[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svuint8x4_t test_svrshl_single_u8_x4(svuint8x4_t zdn, svuint8_t zm) { + return SVE_ACLE_FUNC(svrshl,_single_u8_x4,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svrshl_single_u16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.urshl.single.x4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 8) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP8]], [[TMP9]], i64 16) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 24) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z25test_svrshl_single_u16_x412svuint16x4_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.urshl.single.x4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 8) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP8]], [[TMP9]], i64 16) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svuint16x4_t test_svrshl_single_u16_x4(svuint16x4_t zdn, svuint16_t zm) { + return SVE_ACLE_FUNC(svrshl,_single_u16_x4,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svrshl_single_u32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call 
@llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.urshl.single.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP8]], [[TMP9]], i64 8) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 12) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z25test_svrshl_single_u32_x412svuint32x4_tu12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.urshl.single.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP8]], [[TMP9]], i64 8) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svuint32x4_t test_svrshl_single_u32_x4(svuint32x4_t zdn, svuint32_t zm) { + return SVE_ACLE_FUNC(svrshl,_single_u32_x4,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svrshl_single_u64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.urshl.single.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], 
i64 2) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP8]], [[TMP9]], i64 4) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP10]], [[TMP11]], i64 6) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z25test_svrshl_single_u64_x412svuint64x4_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.urshl.single.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 2) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP8]], [[TMP9]], i64 4) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP10]], [[TMP11]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svuint64x4_t test_svrshl_single_u64_x4(svuint64x4_t zdn, svuint64_t zm) { + return SVE_ACLE_FUNC(svrshl,_single_u64_x4,,,)(zdn, zm); +} + +// Multi, x2 + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svrshl_multi_s8_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.srshl.x2.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z23test_svrshl_multi_s8_x210svint8x2_t10svint8x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.srshl.x2.nxv16i8( [[TMP0]], [[TMP1]], 
[[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint8x2_t test_svrshl_multi_s8_x2(svint8x2_t zdn, svint8x2_t zm) { + return SVE_ACLE_FUNC(svrshl,_s8_x2,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svrshl_multi_s16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.srshl.x2.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP6]], [[TMP7]], i64 8) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z24test_svrshl_multi_s16_x211svint16x2_t11svint16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.srshl.x2.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP6]], [[TMP7]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint16x2_t test_svrshl_multi_s16_x2(svint16x2_t zdn, svint16x2_t zm) { + return SVE_ACLE_FUNC(svrshl,_s16_x2,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svrshl_multi_s32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.srshl.x2.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call 
@llvm.vector.insert.nxv8i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z24test_svrshl_multi_s32_x211svint32x2_t11svint32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.srshl.x2.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint32x2_t test_svrshl_multi_s32_x2(svint32x2_t zdn, svint32x2_t zm) { + return SVE_ACLE_FUNC(svrshl,_s32_x2,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svrshl_multi_s64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM]], i64 2) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.srshl.x2.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP6]], [[TMP7]], i64 2) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z24test_svrshl_multi_s64_x211svint64x2_t11svint64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM]], i64 2) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.srshl.x2.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP6]], [[TMP7]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint64x2_t test_svrshl_multi_s64_x2(svint64x2_t zdn, svint64x2_t zm) { + return SVE_ACLE_FUNC(svrshl,_s64_x2,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svrshl_multi_u8_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( 
[[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.urshl.x2.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z23test_svrshl_multi_u8_x211svuint8x2_t11svuint8x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.urshl.x2.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint8x2_t test_svrshl_multi_u8_x2(svuint8x2_t zdn, svuint8x2_t zm) { + return SVE_ACLE_FUNC(svrshl,_u8_x2,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svrshl_multi_u16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.urshl.x2.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP6]], [[TMP7]], i64 8) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z24test_svrshl_multi_u16_x212svuint16x2_t12svuint16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.urshl.x2.nxv8i16( [[TMP0]], [[TMP1]], 
[[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP6]], [[TMP7]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint16x2_t test_svrshl_multi_u16_x2(svuint16x2_t zdn, svuint16x2_t zm) { + return SVE_ACLE_FUNC(svrshl,_u16_x2,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svrshl_multi_u32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.urshl.x2.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z24test_svrshl_multi_u32_x212svuint32x2_t12svuint32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.urshl.x2.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint32x2_t test_svrshl_multi_u32_x2(svuint32x2_t zdn, svuint32x2_t zm) { + return SVE_ACLE_FUNC(svrshl,_u32_x2,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svrshl_multi_u64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM]], i64 2) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.urshl.x2.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call 
@llvm.vector.insert.nxv4i64.nxv2i64( [[TMP6]], [[TMP7]], i64 2) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z24test_svrshl_multi_u64_x212svuint64x2_t12svuint64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM]], i64 2) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.urshl.x2.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP6]], [[TMP7]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint64x2_t test_svrshl_multi_u64_x2(svuint64x2_t zdn, svuint64x2_t zm) { + return SVE_ACLE_FUNC(svrshl,_u64_x2,,,)(zdn, zm); +} + +// Multi, x4 + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svrshl_multi_s8_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 32) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 48) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.srshl.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 16) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP12]], [[TMP13]], i64 32) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP14]], [[TMP15]], i64 48) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z23test_svrshl_multi_s8_x410svint8x4_t10svint8x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 48) +// 
CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 32) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 48) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.srshl.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 16) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP12]], [[TMP13]], i64 32) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP14]], [[TMP15]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svint8x4_t test_svrshl_multi_s8_x4(svint8x4_t zdn, svint8x4_t zm) { + return SVE_ACLE_FUNC(svrshl,_s8_x4,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svrshl_multi_s16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 24) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.srshl.x4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 8) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP12]], [[TMP13]], i64 16) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP14]], [[TMP15]], i64 24) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z24test_svrshl_multi_s16_x411svint16x4_t11svint16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 8) +// 
CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 24) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.srshl.x4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 8) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP12]], [[TMP13]], i64 16) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP14]], [[TMP15]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svint16x4_t test_svrshl_multi_s16_x4(svint16x4_t zdn, svint16x4_t zm) { + return SVE_ACLE_FUNC(svrshl,_s16_x4,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svrshl_multi_s32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 12) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.srshl.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 4) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP12]], [[TMP13]], i64 8) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP14]], [[TMP15]], i64 12) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z24test_svrshl_multi_s32_x411svint32x4_t11svint32x4_t( +// CPP-CHECK-NEXT: entry: +// 
CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 12) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.srshl.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 4) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP12]], [[TMP13]], i64 8) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP14]], [[TMP15]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svint32x4_t test_svrshl_multi_s32_x4(svint32x4_t zdn, svint32x4_t zm) { + return SVE_ACLE_FUNC(svrshl,_s32_x4,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svrshl_multi_s64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 2) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 6) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.srshl.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP10]], [[TMP11]], i64 2) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP12]], [[TMP13]], i64 4) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call 
@llvm.vector.insert.nxv8i64.nxv2i64( [[TMP14]], [[TMP15]], i64 6) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z24test_svrshl_multi_s64_x411svint64x4_t11svint64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 2) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 6) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.srshl.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP10]], [[TMP11]], i64 2) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP12]], [[TMP13]], i64 4) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP14]], [[TMP15]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svint64x4_t test_svrshl_multi_s64_x4(svint64x4_t zdn, svint64x4_t zm) { + return SVE_ACLE_FUNC(svrshl,_s64_x4,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svrshl_multi_u8_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 32) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 48) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.urshl.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 16) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail 
call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP12]], [[TMP13]], i64 32) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP14]], [[TMP15]], i64 48) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z23test_svrshl_multi_u8_x411svuint8x4_t11svuint8x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 32) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 48) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.urshl.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 16) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP12]], [[TMP13]], i64 32) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP14]], [[TMP15]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svuint8x4_t test_svrshl_multi_u8_x4(svuint8x4_t zdn, svuint8x4_t zm) { + return SVE_ACLE_FUNC(svrshl,_u8_x4,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svrshl_multi_u16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 24) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.urshl.x4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: 
[[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 8) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP12]], [[TMP13]], i64 16) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP14]], [[TMP15]], i64 24) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z24test_svrshl_multi_u16_x412svuint16x4_t12svuint16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 24) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.urshl.x4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 8) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP12]], [[TMP13]], i64 16) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP14]], [[TMP15]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svuint16x4_t test_svrshl_multi_u16_x4(svuint16x4_t zdn, svuint16x4_t zm) { + return SVE_ACLE_FUNC(svrshl,_u16_x4,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svrshl_multi_u32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 12) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.urshl.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue 
{ , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 4) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP12]], [[TMP13]], i64 8) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP14]], [[TMP15]], i64 12) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z24test_svrshl_multi_u32_x412svuint32x4_t12svuint32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 12) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.urshl.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 4) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP12]], [[TMP13]], i64 8) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP14]], [[TMP15]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svuint32x4_t test_svrshl_multi_u32_x4(svuint32x4_t zdn, svuint32x4_t zm) { + return SVE_ACLE_FUNC(svrshl,_u32_x4,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svrshl_multi_u64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 2) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 6) +// 
CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.urshl.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP10]], [[TMP11]], i64 2) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP12]], [[TMP13]], i64 4) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP14]], [[TMP15]], i64 6) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z24test_svrshl_multi_u64_x412svuint64x4_t12svuint64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 2) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 6) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.urshl.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP10]], [[TMP11]], i64 2) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP12]], [[TMP13]], i64 4) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP14]], [[TMP15]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svuint64x4_t test_svrshl_multi_u64_x4(svuint64x4_t zdn, svuint64x4_t zm) { + return SVE_ACLE_FUNC(svrshl,_u64_x4,,,)(zdn, zm); +} diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_set2-bool.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_set2-bool.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_set2-bool.c @@ -0,0 +1,47 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S 
-passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+
+// REQUIRES: aarch64-registered-target
+
+#include 
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4
+#endif
+
+// CHECK-LABEL: @test_svset2_b8_0(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 32 x i1> @llvm.vector.insert.nxv32i1.nxv16i1(<vscale x 32 x i1> [[TUPLE:%.*]], <vscale x 16 x i1> [[X:%.*]], i64 0)
+// CHECK-NEXT: ret <vscale x 32 x i1> [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z16test_svset2_b8_010svboolx2_tu10__SVBool_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 32 x i1> @llvm.vector.insert.nxv32i1.nxv16i1(<vscale x 32 x i1> [[TUPLE:%.*]], <vscale x 16 x i1> [[X:%.*]], i64 0)
+// CPP-CHECK-NEXT: ret <vscale x 32 x i1> [[TMP0]]
+//
+svboolx2_t test_svset2_b8_0(svboolx2_t tuple, svbool_t x)
+{
+  return SVE_ACLE_FUNC(svset2,_b8,,)(tuple, 0, x);
+}
+
+// CHECK-LABEL: @test_svset2_b8_1(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 32 x i1> @llvm.vector.insert.nxv32i1.nxv16i1(<vscale x 32 x i1> [[TUPLE:%.*]], <vscale x 16 x i1> [[X:%.*]], i64 16)
+// CHECK-NEXT: ret <vscale x 32 x i1> [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z16test_svset2_b8_110svboolx2_tu10__SVBool_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 32 x i1> @llvm.vector.insert.nxv32i1.nxv16i1(<vscale x 32 x i1> [[TUPLE:%.*]], <vscale x 16 x i1> [[X:%.*]], i64 16)
+// CPP-CHECK-NEXT: ret <vscale x 32 x i1> [[TMP0]]
+//
+svboolx2_t test_svset2_b8_1(svboolx2_t tuple, svbool_t x)
+{
+  return SVE_ACLE_FUNC(svset2,_b8,,)(tuple, 1, x);
+}
+
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_set4-bool.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_set4-bool.c
new file mode 100644
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_set4-bool.c
@@ -0,0 +1,62 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+
+// REQUIRES: aarch64-registered-target
+
+#include 
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4
+#endif
+
+
+// CHECK-LABEL: @test_svset4_b8_0(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 64 x i1> @llvm.vector.insert.nxv64i1.nxv16i1(<vscale x 64 x i1> [[TUPLE:%.*]], <vscale x 16 x i1> [[X:%.*]], i64 0)
+// CHECK-NEXT: ret <vscale x 64 x i1> [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z16test_svset4_b8_010svboolx4_tu10__SVBool_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 64 x i1> @llvm.vector.insert.nxv64i1.nxv16i1(<vscale x 64 x i1> [[TUPLE:%.*]], <vscale x 16 x i1> [[X:%.*]], i64 0)
+// CPP-CHECK-NEXT: ret <vscale x 64 x i1> [[TMP0]]
+//
+svboolx4_t test_svset4_b8_0(svboolx4_t tuple, svbool_t x)
+{
+  return SVE_ACLE_FUNC(svset4,_b8,,)(tuple, 0, x);
+}
+
+// CHECK-LABEL: @test_svset4_b8_1(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 64 x i1> @llvm.vector.insert.nxv64i1.nxv16i1(<vscale x 64 x i1> [[TUPLE:%.*]], <vscale x 16 x i1> [[X:%.*]], i64 16)
+// CHECK-NEXT: ret <vscale x 64 x i1> [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z16test_svset4_b8_110svboolx4_tu10__SVBool_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 64 x i1> @llvm.vector.insert.nxv64i1.nxv16i1(<vscale x 64 x i1> [[TUPLE:%.*]], <vscale x 16 x i1> [[X:%.*]], i64 16)
+// CPP-CHECK-NEXT: ret <vscale x 64 x i1> [[TMP0]]
+//
+svboolx4_t test_svset4_b8_1(svboolx4_t tuple, svbool_t x)
+{
+  return SVE_ACLE_FUNC(svset4,_b8,,)(tuple, 1, x);
+}
+
+// CHECK-LABEL: @test_svset4_b8_3(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 64 x i1> @llvm.vector.insert.nxv64i1.nxv16i1(<vscale x 64 x i1> [[TUPLE:%.*]], <vscale x 16 x i1> [[X:%.*]], i64 48)
+// CHECK-NEXT: ret <vscale x 64 x i1> [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z16test_svset4_b8_310svboolx4_tu10__SVBool_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 64 x i1> @llvm.vector.insert.nxv64i1.nxv16i1(<vscale x 64 x i1> [[TUPLE:%.*]], <vscale x 16 x i1> [[X:%.*]], i64 48)
+// CPP-CHECK-NEXT: ret <vscale x 64 x i1> [[TMP0]]
+//
+svboolx4_t test_svset4_b8_3(svboolx4_t tuple, svbool_t x)
+{
+  return SVE_ACLE_FUNC(svset4,_b8,,)(tuple, 3, x);
+}
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_sqdmulh.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_sqdmulh.c
new file mode 100644
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_sqdmulh.c
@@ -0,0 +1,600 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: aarch64-registered-target
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+#include 
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED,A5) A1##A3##A5 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4,A5) A1##A2##A3##A4##A5 +#endif + +// Single, x2 + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svsqdmulh_single_s8_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.sqdmulh.single.vgx2.nxv16i8( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP4]], [[TMP5]], i64 16) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z27test_svsqdmulh_single_s8_x210svint8x2_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.sqdmulh.single.vgx2.nxv16i8( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP4]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svint8x2_t test_svsqdmulh_single_s8_x2(svint8x2_t zdn, svint8_t zm) { + return SVE_ACLE_FUNC(svsqdmulh,_single_s8_x2,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svsqdmulh_single_s16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.sqdmulh.single.vgx2.nxv8i16( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z28test_svsqdmulh_single_s16_x211svint16x2_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.sqdmulh.single.vgx2.nxv8i16( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svint16x2_t 
test_svsqdmulh_single_s16_x2(svint16x2_t zdn, svint16_t zm) { + return SVE_ACLE_FUNC(svsqdmulh,_single_s16_x2,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svsqdmulh_single_s32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.sqdmulh.single.vgx2.nxv4i32( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z28test_svsqdmulh_single_s32_x211svint32x2_tu11__SVInt32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.sqdmulh.single.vgx2.nxv4i32( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svint32x2_t test_svsqdmulh_single_s32_x2(svint32x2_t zdn, svint32_t zm) { + return SVE_ACLE_FUNC(svsqdmulh,_single_s32_x2,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svsqdmulh_single_s64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.sqdmulh.single.vgx2.nxv2i64( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP4]], [[TMP5]], i64 2) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z28test_svsqdmulh_single_s64_x211svint64x2_tu11__SVInt64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.sqdmulh.single.vgx2.nxv2i64( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP4]], [[TMP5]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svint64x2_t test_svsqdmulh_single_s64_x2(svint64x2_t 
zdn, svint64_t zm) { + return SVE_ACLE_FUNC(svsqdmulh,_single_s64_x2,,,)(zdn, zm); +} + +// Single, x4 + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svsqdmulh_single_s8_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.sqdmulh.single.vgx4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP8]], [[TMP9]], i64 32) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 48) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z27test_svsqdmulh_single_s8_x410svint8x4_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.sqdmulh.single.vgx4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP8]], [[TMP9]], i64 32) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svint8x4_t test_svsqdmulh_single_s8_x4(svint8x4_t zdn, svint8_t zm) { + return SVE_ACLE_FUNC(svsqdmulh,_single_s8_x4,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svsqdmulh_single_s16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = 
tail call { , , , } @llvm.aarch64.sve.sqdmulh.single.vgx4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 8) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP8]], [[TMP9]], i64 16) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 24) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z28test_svsqdmulh_single_s16_x411svint16x4_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.sqdmulh.single.vgx4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 8) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP8]], [[TMP9]], i64 16) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svint16x4_t test_svsqdmulh_single_s16_x4(svint16x4_t zdn, svint16_t zm) { + return SVE_ACLE_FUNC(svsqdmulh,_single_s16_x4,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svsqdmulh_single_s32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.sqdmulh.single.vgx4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( 
[[TMP8]], [[TMP9]], i64 8) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 12) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z28test_svsqdmulh_single_s32_x411svint32x4_tu11__SVInt32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.sqdmulh.single.vgx4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP8]], [[TMP9]], i64 8) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svint32x4_t test_svsqdmulh_single_s32_x4(svint32x4_t zdn, svint32_t zm) { + return SVE_ACLE_FUNC(svsqdmulh,_single_s32_x4,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svsqdmulh_single_s64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.sqdmulh.single.vgx4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 2) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP8]], [[TMP9]], i64 4) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP10]], [[TMP11]], i64 6) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z28test_svsqdmulh_single_s64_x411svint64x4_tu11__SVInt64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call 
@llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.sqdmulh.single.vgx4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 2) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP8]], [[TMP9]], i64 4) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP10]], [[TMP11]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svint64x4_t test_svsqdmulh_single_s64_x4(svint64x4_t zdn, svint64_t zm) { + return SVE_ACLE_FUNC(svsqdmulh,_single_s64_x4,,,)(zdn, zm); +} + +// Multi, x2 + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svsqdmulh_s8_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.sqdmulh.vgx2.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z20test_svsqdmulh_s8_x210svint8x2_t10svint8x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.sqdmulh.vgx2.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint8x2_t test_svsqdmulh_s8_x2(svint8x2_t zdn, svint8x2_t zm) { + return SVE_ACLE_FUNC(svsqdmulh,_s8_x2,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svsqdmulh_s16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: 
[[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.sqdmulh.vgx2.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP6]], [[TMP7]], i64 8) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z21test_svsqdmulh_s16_x211svint16x2_t11svint16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.sqdmulh.vgx2.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP6]], [[TMP7]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint16x2_t test_svsqdmulh_s16_x2(svint16x2_t zdn, svint16x2_t zm) { + return SVE_ACLE_FUNC(svsqdmulh,_s16_x2,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svsqdmulh_s32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.sqdmulh.vgx2.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z21test_svsqdmulh_s32_x211svint32x2_t11svint32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.sqdmulh.vgx2.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: 
[[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint32x2_t test_svsqdmulh_s32_x2(svint32x2_t zdn, svint32x2_t zm) { + return SVE_ACLE_FUNC(svsqdmulh,_s32_x2,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svsqdmulh_s64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM]], i64 2) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.sqdmulh.vgx2.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP6]], [[TMP7]], i64 2) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z21test_svsqdmulh_s64_x211svint64x2_t11svint64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM]], i64 2) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.sqdmulh.vgx2.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP6]], [[TMP7]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint64x2_t test_svsqdmulh_s64_x2(svint64x2_t zdn, svint64x2_t zm) { + return SVE_ACLE_FUNC(svsqdmulh,_s64_x2,,,)(zdn, zm); +} + +// Multi, x4 + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svsqdmulh_s8_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 32) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 48) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } 
@llvm.aarch64.sve.sqdmulh.vgx4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 16) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP12]], [[TMP13]], i64 32) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP14]], [[TMP15]], i64 48) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z20test_svsqdmulh_s8_x410svint8x4_t10svint8x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 32) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 48) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.sqdmulh.vgx4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 16) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP12]], [[TMP13]], i64 32) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP14]], [[TMP15]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svint8x4_t test_svsqdmulh_s8_x4(svint8x4_t zdn, svint8x4_t zm) { + return SVE_ACLE_FUNC(svsqdmulh,_s8_x4,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svsqdmulh_s16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = tail call 
@llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 24) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.sqdmulh.vgx4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 8) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP12]], [[TMP13]], i64 16) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP14]], [[TMP15]], i64 24) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z21test_svsqdmulh_s16_x411svint16x4_t11svint16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 24) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.sqdmulh.vgx4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 8) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP12]], [[TMP13]], i64 16) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP14]], [[TMP15]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svint16x4_t test_svsqdmulh_s16_x4(svint16x4_t zdn, svint16x4_t zm) { + return SVE_ACLE_FUNC(svsqdmulh,_s16_x4,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svsqdmulh_s32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 12) +// CHECK-NEXT: 
[[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 12) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.sqdmulh.vgx4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 4) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP12]], [[TMP13]], i64 8) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP14]], [[TMP15]], i64 12) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z21test_svsqdmulh_s32_x411svint32x4_t11svint32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 12) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.sqdmulh.vgx4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 4) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP12]], [[TMP13]], i64 8) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP14]], [[TMP15]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svint32x4_t test_svsqdmulh_s32_x4(svint32x4_t zdn, svint32x4_t zm) { + return SVE_ACLE_FUNC(svsqdmulh,_s32_x4,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svsqdmulh_s64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 2) +// 
CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 2) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 6) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.sqdmulh.vgx4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP10]], [[TMP11]], i64 2) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP12]], [[TMP13]], i64 4) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP14]], [[TMP15]], i64 6) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z21test_svsqdmulh_s64_x411svint64x4_t11svint64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 2) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 6) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.sqdmulh.vgx4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP10]], [[TMP11]], i64 2) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP12]], [[TMP13]], i64 4) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP14]], [[TMP15]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svint64x4_t test_svsqdmulh_s64_x4(svint64x4_t zdn, svint64x4_t zm) { + return SVE_ACLE_FUNC(svsqdmulh,_s64_x4,,,)(zdn, zm); +} diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_sub.c 
b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_sub.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_sub.c @@ -0,0 +1,434 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py + +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED,A5) A1##A3##A5 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4,A5) A1##A2##A3##A4##A5 +#endif + +// +// Single-Multi +// + +// x2 +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svsub_write_single2_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.single.za.vg1x2.nxv4i32(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z28test_svsub_write_single2_u32j12svuint32x2_tu12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.single.za.vg1x2.nxv4i32(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svsub_write_single2_u32(uint32_t slice_base, svuint32x2_t zn, svuint32_t zm) { + SVE_ACLE_FUNC(svsub_write,_single,_za32,_u32,_vg1x2)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svsub_write_single2_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void 
@llvm.aarch64.sme.sub.write.single.za.vg1x2.nxv2i64(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z28test_svsub_write_single2_u64j12svuint64x2_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.single.za.vg1x2.nxv2i64(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svsub_write_single2_u64(uint32_t slice_base, svuint64x2_t zn, svuint64_t zm) { + SVE_ACLE_FUNC(svsub_write,_single,_za64,_u64,_vg1x2)(slice_base, 7, zn, zm); +} + +// x4 + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svsub_write_single4_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.single.za.vg1x4.nxv4i32(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z28test_svsub_write_single4_u32j12svuint32x4_tu12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.single.za.vg1x4.nxv4i32(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svsub_write_single4_u32(uint32_t slice_base, svuint32x4_t zn, svuint32_t zm) { + SVE_ACLE_FUNC(svsub_write,_single,_za32,_u32,_vg1x4)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svsub_write_single4_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.single.za.vg1x4.nxv2i64(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z28test_svsub_write_single4_u64j12svuint64x4_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call 
@llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.single.za.vg1x4.nxv2i64(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svsub_write_single4_u64(uint32_t slice_base, svuint64x4_t zn, svuint64_t zm) { + SVE_ACLE_FUNC(svsub_write,_single,_za64,_u64,_vg1x4)(slice_base, 7, zn, zm); +} + +// +// Multi-Multi +// + +// x2 + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svsub_write_multi2_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.za.vg1x2.nxv4i32(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svsub_write_multi2_u32j12svuint32x2_t12svuint32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.za.vg1x2.nxv4i32(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svsub_write_multi2_u32(uint32_t slice_base, svuint32x2_t zn, svuint32x2_t zm) { + SVE_ACLE_FUNC(svsub_write,,_za32,_u32,_vg1x2)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svsub_write_multi2_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM]], i64 2) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.za.vg1x2.nxv2i64(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svsub_write_multi2_u64j12svuint64x2_t12svuint64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call 
@llvm.vector.extract.nxv2i64.nxv4i64( [[ZM]], i64 2) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.za.vg1x2.nxv2i64(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svsub_write_multi2_u64(uint32_t slice_base, svuint64x2_t zn, svuint64x2_t zm) { + SVE_ACLE_FUNC(svsub_write,,_za64,_u64,_vg1x2)(slice_base, 7, zn, zm); +} + +// x4 + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svsub_write_multi4_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 12) +// CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.za.vg1x4.nxv4i32(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svsub_write_multi4_u32j12svuint32x4_t12svuint32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 12) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.za.vg1x4.nxv4i32(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_svsub_write_multi4_u32(uint32_t slice_base, svuint32x4_t zn, svuint32x4_t zm) { + SVE_ACLE_FUNC(svsub_write,,_za32,_u32,_vg1x4)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svsub_write_multi4_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM:%.*]], i64 
0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 2) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 6) +// CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.za.vg1x4.nxv2i64(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svsub_write_multi4_u64j12svuint64x4_t12svuint64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 2) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 6) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.za.vg1x4.nxv2i64(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_svsub_write_multi4_u64(uint32_t slice_base, svuint64x4_t zn, svuint64x4_t zm) { + SVE_ACLE_FUNC(svsub_write,,_za64,_u64,_vg1x4)(slice_base, 7, zn, zm); +} + +// +// Accumulate to ZA +// + +// x2 + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svsub_za32_vg1x2_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za32.vg1x2.nxv4f32(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svsub_za32_vg1x2_f32j13svfloat32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za32.vg1x2.nxv4f32(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svsub_za32_vg1x2_f32(uint32_t slice_base, svfloat32x2_t zn) { + SVE_ACLE_FUNC(svsub_za32,,_f32,,_vg1x2)(slice_base, 6, zn); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svsub_za32_vg1x2_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za32.vg1x2.nxv4i32(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// 
CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svsub_za32_vg1x2_u32j12svuint32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za32.vg1x2.nxv4i32(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svsub_za32_vg1x2_u32(uint32_t slice_base, svuint32x2_t zn) { + SVE_ACLE_FUNC(svsub_za32,,_u32,,_vg1x2)(slice_base, 6, zn); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svsub_za64_vg1x2_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za64.vg1x2.nxv2f64(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svsub_za64_vg1x2_f64j13svfloat64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za64.vg1x2.nxv2f64(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svsub_za64_vg1x2_f64(uint32_t slice_base, svfloat64x2_t zn) { + SVE_ACLE_FUNC(svsub_za64,,_f64,,_vg1x2)(slice_base, 6, zn); +} + + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svsub_za64_vg1x2_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za64.vg1x2.nxv2i64(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svsub_za64_vg1x2_u64j12svuint64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za64.vg1x2.nxv2i64(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svsub_za64_vg1x2_u64(uint32_t slice_base, svuint64x2_t zn) { + SVE_ACLE_FUNC(svsub_za64,,_u64,,_vg1x2)(slice_base, 6, zn); +} + +// x4 + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svsub_za32_vg1x4_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail 
call void @llvm.aarch64.sme.sub.za32.vg1x4.nxv4f32(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svsub_za32_vg1x4_f32j13svfloat32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za32.vg1x4.nxv4f32(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svsub_za32_vg1x4_f32(uint32_t slice_base, svfloat32x4_t zn) { + SVE_ACLE_FUNC(svsub_za32,,_f32,,_vg1x4)(slice_base, 6, zn); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svsub_za32_vg1x4_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za32.vg1x4.nxv4i32(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svsub_za32_vg1x4_u32j12svuint32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za32.vg1x4.nxv4i32(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svsub_za32_vg1x4_u32(uint32_t slice_base, svuint32x4_t zn) { + SVE_ACLE_FUNC(svsub_za32,,_u32,,_vg1x4)(slice_base, 6, zn); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svsub_za64_vg1x4_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za64.vg1x4.nxv2f64(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svsub_za64_vg1x4_f64j13svfloat64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call 
@llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za64.vg1x4.nxv2f64(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svsub_za64_vg1x4_f64(uint32_t slice_base, svfloat64x4_t zn) { + SVE_ACLE_FUNC(svsub_za64,,_f64,,_vg1x4)(slice_base, 6, zn); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svsub_za64_vg1x4_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za64.vg1x4.nxv2i64(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svsub_za64_vg1x4_u64j12svuint64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za64.vg1x4.nxv2i64(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svsub_za64_vg1x4_u64(uint32_t slice_base, svuint64x4_t zn) { + SVE_ACLE_FUNC(svsub_za64,,_u64,,_vg1x4)(slice_base, 6, zn); +} diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vdot.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vdot.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vdot.c @@ -0,0 +1,252 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x 
c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3) A1##A2##A3 +#endif + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svvdot_lane_za32_bf16_vg1x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fvdot.lane.za32.vg1x2.nxv8bf16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z32test_svvdot_lane_za32_bf16_vg1x2j14svbfloat16x2_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fvdot.lane.za32.vg1x2.nxv8bf16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svvdot_lane_za32_bf16_vg1x2(uint32_t slice_base, svbfloat16x2_t zn, svbfloat16_t zm) { + SVE_ACLE_FUNC(svvdot_lane_za32,_bf16,_vg1x2)(slice_base, 7, zn, zm, 3); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svvdot_lane_za32_f16_vg1x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fvdot.lane.za32.vg1x2.nxv8f16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z31test_svvdot_lane_za32_f16_vg1x2j13svfloat16x2_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fvdot.lane.za32.vg1x2.nxv8f16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svvdot_lane_za32_f16_vg1x2(uint32_t slice_base, svfloat16x2_t zn, svfloat16_t zm) { + SVE_ACLE_FUNC(svvdot_lane_za32,_f16,_vg1x2)(slice_base, 7, zn, zm, 3); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svvdot_lane_za32_s16_vg1x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 3) +// CHECK-NEXT: ret void +// 
+// CPP-CHECK-LABEL: @_Z31test_svvdot_lane_za32_s16_vg1x2j11svint16x2_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svvdot_lane_za32_s16_vg1x2(uint32_t slice_base, svint16x2_t zn, svint16_t zm) { + SVE_ACLE_FUNC(svvdot_lane_za32,_s16,_vg1x2)(slice_base, 7, zn, zm, 3); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svvdot_lane_za32_u16_vg1x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z31test_svvdot_lane_za32_u16_vg1x2j12svuint16x2_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svvdot_lane_za32_u16_vg1x2(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) { + SVE_ACLE_FUNC(svvdot_lane_za32,_u16,_vg1x2)(slice_base, 7, zn, zm, 3); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svvdot_lane_za32_s8_vg1x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z30test_svvdot_lane_za32_s8_vg1x4j10svint8x4_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svvdot_lane_za32_s8_vg1x4(uint32_t slice_base, 
svint8x4_t zn, svint8_t zm) { + SVE_ACLE_FUNC(svvdot_lane_za32,_s8,_vg1x4)(slice_base, 7, zn, zm, 3); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svvdot_lane_za32_u8_vg1x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z30test_svvdot_lane_za32_u8_vg1x4j11svuint8x4_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svvdot_lane_za32_u8_vg1x4(uint32_t slice_base, svuint8x4_t zn, svuint8_t zm) { + SVE_ACLE_FUNC(svvdot_lane_za32,_u8,_vg1x4)(slice_base, 7, zn, zm, 3); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svvdot_lane_za64_s16_vg1x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.svdot.lane.za64.vg1x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 1) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z31test_svvdot_lane_za64_s16_vg1x4j11svint16x4_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.svdot.lane.za64.vg1x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 1) +// CPP-CHECK-NEXT: ret void +// +void test_svvdot_lane_za64_s16_vg1x4(uint32_t slice_base, svint16x4_t zn, svint16_t zm) { + SVE_ACLE_FUNC(svvdot_lane_za64,_s16,_vg1x4)(slice_base, 7, zn, zm, 1); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: 
@test_svvdot_lane_za64_u16_vg1x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.uvdot.lane.za64.vg1x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 1) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z31test_svvdot_lane_za64_u16_vg1x4j12svuint16x4_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.uvdot.lane.za64.vg1x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 1) +// CPP-CHECK-NEXT: ret void +// +void test_svvdot_lane_za64_u16_vg1x4(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) { + SVE_ACLE_FUNC(svvdot_lane_za64,_u16,_vg1x4)(slice_base, 7, zn, zm, 1); +} + + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svsuvdot_lane_za32_s8_vg1x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z32test_svsuvdot_lane_za32_s8_vg1x4j10svint8x4_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svsuvdot_lane_za32_s8_vg1x4(uint32_t slice_base, svint8x4_t zn, svint8_t zm) { + SVE_ACLE_FUNC(svsuvdot_lane_za32,_s8,_vg1x4)(slice_base, 7, zn, zm, 3); +} + + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svusvdot_lane_za32_u8_vg1x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: 
[[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z32test_svusvdot_lane_za32_u8_vg1x4j11svuint8x4_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svusvdot_lane_za32_u8_vg1x4(uint32_t slice_base, svuint8x4_t zn, svuint8_t zm) { + SVE_ACLE_FUNC(svusvdot_lane_za32,_u8,_vg1x4)(slice_base, 7, zn, zm, 3); +} + diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_write.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_write.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_write.c @@ -0,0 +1,1222 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. 
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 +#endif + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za8_u8_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[VAL]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 14 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg2.nxv16i8(i32 0, i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_ver_za8_u8_vg2j11svuint8x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[VAL]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 14 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg2.nxv16i8(i32 0, i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za8_u8_vg2(uint32_t base, svuint8x2_t val) { + SVE_ACLE_FUNC(svwrite_ver_za8,_u8,_vg2,)(0, base, 14, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za8_s8_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[VAL]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 14 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg2.nxv16i8(i32 0, i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_ver_za8_s8_vg2j10svint8x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[VAL]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 14 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg2.nxv16i8(i32 0, i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za8_s8_vg2(uint32_t base, svint8x2_t val) { + SVE_ACLE_FUNC(svwrite_ver_za8,_s8,_vg2,)(0, base, 14, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za8_u8_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[VAL]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 14 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg2.nxv16i8(i32 0, i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_hor_za8_u8_vg2j11svuint8x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[VAL]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 14 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg2.nxv16i8(i32 0, i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za8_u8_vg2(uint32_t base, svuint8x2_t val) { + 
SVE_ACLE_FUNC(svwrite_hor_za8,_u8,_vg2,)(0, base, 14, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za8_s8_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[VAL]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 14 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg2.nxv16i8(i32 0, i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_hor_za8_s8_vg2j10svint8x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[VAL]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 14 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg2.nxv16i8(i32 0, i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za8_s8_vg2(uint32_t base, svint8x2_t val) { + SVE_ACLE_FUNC(svwrite_hor_za8,_s8,_vg2,)(0, base, 14, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za8_u8_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[VAL]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[VAL]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[VAL]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[BASE:%.*]], 12 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg4.nxv16i8(i32 0, i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_hor_za8_u8_vg4j11svuint8x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[VAL]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[VAL]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[VAL]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[BASE:%.*]], 12 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg4.nxv16i8(i32 0, i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za8_u8_vg4(uint32_t base, svuint8x4_t val) { + SVE_ACLE_FUNC(svwrite_hor_za8,_u8,_vg4,)(0, base, 12, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za8_s8_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[VAL]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[VAL]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[VAL]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[BASE:%.*]], 12 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg4.nxv16i8(i32 0, i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: 
@_Z27test_svwrite_hor_za8_s8_vg4j10svint8x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[VAL]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[VAL]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[VAL]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[BASE:%.*]], 12 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg4.nxv16i8(i32 0, i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za8_s8_vg4(uint32_t base, svint8x4_t val) { + SVE_ACLE_FUNC(svwrite_hor_za8,_s8,_vg4,)(0, base, 12, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za8_u8_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[VAL]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[VAL]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[VAL]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[BASE:%.*]], 12 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg4.nxv16i8(i32 0, i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_ver_za8_u8_vg4j11svuint8x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[VAL]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[VAL]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[VAL]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[BASE:%.*]], 12 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg4.nxv16i8(i32 0, i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za8_u8_vg4(uint32_t base, svuint8x4_t val) { + SVE_ACLE_FUNC(svwrite_ver_za8,_u8,_vg4,)(0, base, 12, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za8_s8_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[VAL]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[VAL]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[VAL]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[BASE:%.*]], 12 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg4.nxv16i8(i32 0, i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_ver_za8_s8_vg4j10svint8x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[VAL]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[VAL]], i64 32) +// 
CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[VAL]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[BASE:%.*]], 12 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg4.nxv16i8(i32 0, i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za8_s8_vg4(uint32_t base, svint8x4_t val) { + SVE_ACLE_FUNC(svwrite_ver_za8,_s8,_vg4,)(0, base, 12, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za16_u16_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[VAL]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg2.nxv8i16(i32 1, i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_hor_za16_u16_vg2j12svuint16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[VAL]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg2.nxv8i16(i32 1, i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za16_u16_vg2(uint32_t base, svuint16x2_t val) { + SVE_ACLE_FUNC(svwrite_hor_za16,_u16,_vg2,)(1, base, 6, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za16_bf16_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[VAL]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg2.nxv8bf16(i32 1, i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z30test_svwrite_hor_za16_bf16_vg2j14svbfloat16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[VAL]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg2.nxv8bf16(i32 1, i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za16_bf16_vg2(uint32_t base, svbfloat16x2_t val) { + SVE_ACLE_FUNC(svwrite_hor_za16,_bf16,_vg2,)(1, base, 6, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za16_f16_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[VAL]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg2.nxv8f16(i32 1, i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_hor_za16_f16_vg2j13svfloat16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail 
call @llvm.vector.extract.nxv8f16.nxv16f16( [[VAL]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg2.nxv8f16(i32 1, i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za16_f16_vg2(uint32_t base, svfloat16x2_t val) { + SVE_ACLE_FUNC(svwrite_hor_za16,_f16,_vg2,)(1, base, 6, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za16_s16_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[VAL]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg2.nxv8i16(i32 1, i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_hor_za16_s16_vg2j11svint16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[VAL]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg2.nxv8i16(i32 1, i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za16_s16_vg2(uint32_t base, svint16x2_t val) { + SVE_ACLE_FUNC(svwrite_hor_za16,_s16,_vg2,)(1, base, 6, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za16_u16_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[VAL]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg2.nxv8i16(i32 1, i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_ver_za16_u16_vg2j12svuint16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[VAL]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg2.nxv8i16(i32 1, i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za16_u16_vg2(uint32_t base, svuint16x2_t val) { + SVE_ACLE_FUNC(svwrite_ver_za16,_u16,_vg2,)(1, base, 6, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za16_bf16_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[VAL]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg2.nxv8bf16(i32 1, i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z30test_svwrite_ver_za16_bf16_vg2j14svbfloat16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[VAL]], i64 8) 
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg2.nxv8bf16(i32 1, i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za16_bf16_vg2(uint32_t base, svbfloat16x2_t val) { + SVE_ACLE_FUNC(svwrite_ver_za16,_bf16,_vg2,)(1, base, 6, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za16_f16_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[VAL]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg2.nxv8f16(i32 1, i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_ver_za16_f16_vg2j13svfloat16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[VAL]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg2.nxv8f16(i32 1, i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za16_f16_vg2(uint32_t base, svfloat16x2_t val) { + SVE_ACLE_FUNC(svwrite_ver_za16,_f16,_vg2,)(1, base, 6, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za16_s16_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[VAL]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg2.nxv8i16(i32 1, i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_ver_za16_s16_vg2j11svint16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[VAL]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg2.nxv8i16(i32 1, i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za16_s16_vg2(uint32_t base, svint16x2_t val) { + SVE_ACLE_FUNC(svwrite_ver_za16,_s16,_vg2,)(1, base, 6, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za16_u16_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[VAL]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[VAL]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[VAL]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg4.nxv8i16(i32 1, i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_hor_za16_u16_vg4j12svuint16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call 
@llvm.vector.extract.nxv8i16.nxv32i16( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[VAL]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[VAL]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[VAL]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg4.nxv8i16(i32 1, i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za16_u16_vg4(uint32_t base, svuint16x4_t val) { + SVE_ACLE_FUNC(svwrite_hor_za16,_u16,_vg4,)(1, base, 4, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za16_bf16_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[VAL]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[VAL]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[VAL]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg4.nxv8bf16(i32 1, i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z30test_svwrite_hor_za16_bf16_vg4j14svbfloat16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[VAL]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[VAL]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[VAL]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg4.nxv8bf16(i32 1, i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za16_bf16_vg4(uint32_t base, svbfloat16x4_t val) { + SVE_ACLE_FUNC(svwrite_hor_za16,_bf16,_vg4,)(1, base, 4, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za16_f16_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[VAL]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[VAL]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[VAL]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg4.nxv8f16(i32 1, i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_hor_za16_f16_vg4j13svfloat16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[VAL]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[VAL]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call 
@llvm.vector.extract.nxv8f16.nxv32f16( [[VAL]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg4.nxv8f16(i32 1, i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za16_f16_vg4(uint32_t base, svfloat16x4_t val) { + SVE_ACLE_FUNC(svwrite_hor_za16,_f16,_vg4,)(1, base, 4, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za16_s16_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[VAL]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[VAL]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[VAL]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg4.nxv8i16(i32 1, i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_hor_za16_s16_vg4j11svint16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[VAL]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[VAL]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[VAL]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg4.nxv8i16(i32 1, i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za16_s16_vg4(uint32_t base, svint16x4_t val) { + SVE_ACLE_FUNC(svwrite_hor_za16,_s16,_vg4,)(1, base, 4, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za16_u16_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[VAL]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[VAL]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[VAL]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg4.nxv8i16(i32 1, i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_ver_za16_u16_vg4j12svuint16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[VAL]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[VAL]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[VAL]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg4.nxv8i16(i32 1, i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za16_u16_vg4(uint32_t base, svuint16x4_t val) { + 
SVE_ACLE_FUNC(svwrite_ver_za16,_u16,_vg4,)(1, base, 4, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za16_bf16_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[VAL]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[VAL]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[VAL]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg4.nxv8bf16(i32 1, i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z30test_svwrite_ver_za16_bf16_vg4j14svbfloat16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[VAL]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[VAL]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[VAL]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg4.nxv8bf16(i32 1, i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za16_bf16_vg4(uint32_t base, svbfloat16x4_t val) { + SVE_ACLE_FUNC(svwrite_ver_za16,_bf16,_vg4,)(1, base, 4, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za16_f16_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[VAL]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[VAL]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[VAL]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg4.nxv8f16(i32 1, i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_ver_za16_f16_vg4j13svfloat16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[VAL]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[VAL]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[VAL]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg4.nxv8f16(i32 1, i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za16_f16_vg4(uint32_t base, svfloat16x4_t val) { + SVE_ACLE_FUNC(svwrite_ver_za16,_f16,_vg4,)(1, base, 4, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za16_s16_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call 
@llvm.vector.extract.nxv8i16.nxv32i16( [[VAL]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[VAL]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[VAL]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg4.nxv8i16(i32 1, i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_ver_za16_s16_vg4j11svint16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[VAL]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[VAL]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[VAL]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg4.nxv8i16(i32 1, i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za16_s16_vg4(uint32_t base, svint16x4_t val) { + SVE_ACLE_FUNC(svwrite_ver_za16,_s16,_vg4,)(1, base, 4, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za32_u32_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[VAL]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 2 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg2.nxv4i32(i32 3, i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_hor_za32_u32_vg2j12svuint32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[VAL]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 2 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg2.nxv4i32(i32 3, i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za32_u32_vg2(uint32_t base, svuint32x2_t val) { + SVE_ACLE_FUNC(svwrite_hor_za32,_u32,_vg2,)(3, base, 2, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za32_f32_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[VAL]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 2 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg2.nxv4f32(i32 3, i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_hor_za32_f32_vg2j13svfloat32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[VAL]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 2 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg2.nxv4f32(i32 3, i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void 
test_svwrite_hor_za32_f32_vg2(uint32_t base, svfloat32x2_t val) { + SVE_ACLE_FUNC(svwrite_hor_za32,_f32,_vg2,)(3, base, 2, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za32_s32_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[VAL]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 2 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg2.nxv4i32(i32 3, i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_hor_za32_s32_vg2j11svint32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[VAL]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 2 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg2.nxv4i32(i32 3, i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za32_s32_vg2(uint32_t base, svint32x2_t val) { + SVE_ACLE_FUNC(svwrite_hor_za32,_s32,_vg2,)(3, base, 2, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za32_u32_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[VAL]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 2 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg2.nxv4i32(i32 3, i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_ver_za32_u32_vg2j12svuint32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[VAL]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 2 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg2.nxv4i32(i32 3, i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za32_u32_vg2(uint32_t base, svuint32x2_t val) { + SVE_ACLE_FUNC(svwrite_ver_za32,_u32,_vg2,)(3, base, 2, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za32_f32_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[VAL]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 2 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg2.nxv4f32(i32 3, i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_ver_za32_f32_vg2j13svfloat32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[VAL]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 2 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg2.nxv4f32(i32 3, i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za32_f32_vg2(uint32_t base, svfloat32x2_t val) { + 
SVE_ACLE_FUNC(svwrite_ver_za32,_f32,_vg2,)(3, base, 2, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za32_s32_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[VAL]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 2 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg2.nxv4i32(i32 3, i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_ver_za32_s32_vg2j11svint32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[VAL]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 2 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg2.nxv4i32(i32 3, i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za32_s32_vg2(uint32_t base, svint32x2_t val) { + SVE_ACLE_FUNC(svwrite_ver_za32,_s32,_vg2,)(3, base, 2, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za32_u32_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[VAL]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[VAL]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[VAL]], i64 12) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg4.nxv4i32(i32 3, i32 [[BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_hor_za32_u32_vg4j12svuint32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[VAL]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[VAL]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[VAL]], i64 12) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg4.nxv4i32(i32 3, i32 [[BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za32_u32_vg4(uint32_t base, svuint32x4_t val) { + SVE_ACLE_FUNC(svwrite_hor_za32,_u32,_vg4,)(3, base, 0, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za32_f32_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[VAL]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[VAL]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[VAL]], i64 12) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg4.nxv4f32(i32 3, i32 [[BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_hor_za32_f32_vg4j13svfloat32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call 
@llvm.vector.extract.nxv4f32.nxv16f32( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[VAL]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[VAL]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[VAL]], i64 12) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg4.nxv4f32(i32 3, i32 [[BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za32_f32_vg4(uint32_t base, svfloat32x4_t val) { + SVE_ACLE_FUNC(svwrite_hor_za32,_f32,_vg4,)(3, base, 0, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za32_s32_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[VAL]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[VAL]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[VAL]], i64 12) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg4.nxv4i32(i32 3, i32 [[BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_hor_za32_s32_vg4j11svint32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[VAL]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[VAL]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[VAL]], i64 12) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg4.nxv4i32(i32 3, i32 [[BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za32_s32_vg4(uint32_t base, svint32x4_t val) { + SVE_ACLE_FUNC(svwrite_hor_za32,_s32,_vg4,)(3, base, 0, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za32_u32_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[VAL]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[VAL]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[VAL]], i64 12) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg4.nxv4i32(i32 3, i32 [[BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_ver_za32_u32_vg4j12svuint32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[VAL]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[VAL]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[VAL]], i64 12) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg4.nxv4i32(i32 3, i32 [[BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za32_u32_vg4(uint32_t 
base, svuint32x4_t val) { + SVE_ACLE_FUNC(svwrite_ver_za32,_u32,_vg4,)(3, base, 0, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za32_f32_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[VAL]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[VAL]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[VAL]], i64 12) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg4.nxv4f32(i32 3, i32 [[BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_ver_za32_f32_vg4j13svfloat32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[VAL]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[VAL]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[VAL]], i64 12) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg4.nxv4f32(i32 3, i32 [[BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za32_f32_vg4(uint32_t base, svfloat32x4_t val) { + SVE_ACLE_FUNC(svwrite_ver_za32,_f32,_vg4,)(3, base, 0, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za32_s32_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[VAL]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[VAL]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[VAL]], i64 12) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg4.nxv4i32(i32 3, i32 [[BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_ver_za32_s32_vg4j11svint32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[VAL]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[VAL]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[VAL]], i64 12) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg4.nxv4i32(i32 3, i32 [[BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za32_s32_vg4(uint32_t base, svint32x4_t val) { + SVE_ACLE_FUNC(svwrite_ver_za32,_s32,_vg4,)(3, base, 0, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za64_u64_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[VAL]], i64 2) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg2.nxv2i64(i32 7, i32 [[BASE:%.*]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: 
@_Z29test_svwrite_hor_za64_u64_vg2j12svuint64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[VAL]], i64 2) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg2.nxv2i64(i32 7, i32 [[BASE:%.*]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za64_u64_vg2(uint32_t base, svuint64x2_t val) { + SVE_ACLE_FUNC(svwrite_hor_za64,_u64,_vg2,)(7, base, 0, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za64_f64_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[VAL]], i64 2) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg2.nxv2f64(i32 7, i32 [[BASE:%.*]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_hor_za64_f64_vg2j13svfloat64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[VAL]], i64 2) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg2.nxv2f64(i32 7, i32 [[BASE:%.*]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za64_f64_vg2(uint32_t base, svfloat64x2_t val) { + SVE_ACLE_FUNC(svwrite_hor_za64,_f64,_vg2,)(7, base, 0, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za64_s64_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[VAL]], i64 2) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg2.nxv2i64(i32 7, i32 [[BASE:%.*]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_hor_za64_s64_vg2j11svint64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[VAL]], i64 2) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg2.nxv2i64(i32 7, i32 [[BASE:%.*]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za64_s64_vg2(uint32_t base, svint64x2_t val) { + SVE_ACLE_FUNC(svwrite_hor_za64,_s64,_vg2,)(7, base, 0, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za64_u64_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[VAL]], i64 2) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg2.nxv2i64(i32 7, i32 [[BASE:%.*]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_ver_za64_u64_vg2j12svuint64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[VAL]], i64 2) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg2.nxv2i64(i32 7, i32 [[BASE:%.*]], [[TMP0]], [[TMP1]]) 
+// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za64_u64_vg2(uint32_t base, svuint64x2_t val) { + SVE_ACLE_FUNC(svwrite_ver_za64,_u64,_vg2,)(7, base, 0, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za64_f64_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[VAL]], i64 2) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg2.nxv2f64(i32 7, i32 [[BASE:%.*]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_ver_za64_f64_vg2j13svfloat64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[VAL]], i64 2) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg2.nxv2f64(i32 7, i32 [[BASE:%.*]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za64_f64_vg2(uint32_t base, svfloat64x2_t val) { + SVE_ACLE_FUNC(svwrite_ver_za64,_f64,_vg2,)(7, base, 0, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za64_s64_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[VAL]], i64 2) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg2.nxv2i64(i32 7, i32 [[BASE:%.*]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_ver_za64_s64_vg2j11svint64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[VAL]], i64 2) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg2.nxv2i64(i32 7, i32 [[BASE:%.*]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za64_s64_vg2(uint32_t base, svint64x2_t val) { + SVE_ACLE_FUNC(svwrite_ver_za64,_s64,_vg2,)(7, base, 0, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za64_u64_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 6) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg4.nxv2i64(i32 7, i32 [[BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_hor_za64_u64_vg4j12svuint64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 6) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg4.nxv2i64(i32 7, i32 [[BASE:%.*]], [[TMP0]], 
[[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za64_u64_vg4(uint32_t base, svuint64x4_t val) { + SVE_ACLE_FUNC(svwrite_hor_za64,_u64,_vg4,)(7, base, 0, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za64_f64_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[VAL]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[VAL]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[VAL]], i64 6) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg4.nxv2f64(i32 7, i32 [[BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_hor_za64_f64_vg4j13svfloat64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[VAL]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[VAL]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[VAL]], i64 6) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg4.nxv2f64(i32 7, i32 [[BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za64_f64_vg4(uint32_t base, svfloat64x4_t val) { + SVE_ACLE_FUNC(svwrite_hor_za64,_f64,_vg4,)(7, base, 0, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za64_s64_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 6) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg4.nxv2i64(i32 7, i32 [[BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_hor_za64_s64_vg4j11svint64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 6) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg4.nxv2i64(i32 7, i32 [[BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za64_s64_vg4(uint32_t base, svint64x4_t val) { + SVE_ACLE_FUNC(svwrite_hor_za64,_s64,_vg4,)(7, base, 0, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za64_u64_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( 
[[VAL]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 6) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg4.nxv2i64(i32 7, i32 [[BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_ver_za64_u64_vg4j12svuint64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 6) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg4.nxv2i64(i32 7, i32 [[BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za64_u64_vg4(uint32_t base, svuint64x4_t val) { + SVE_ACLE_FUNC(svwrite_ver_za64,_u64,_vg4,)(7, base, 0, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za64_f64_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[VAL]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[VAL]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[VAL]], i64 6) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg4.nxv2f64(i32 7, i32 [[BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_ver_za64_f64_vg4j13svfloat64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[VAL]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[VAL]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[VAL]], i64 6) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg4.nxv2f64(i32 7, i32 [[BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za64_f64_vg4(uint32_t base, svfloat64x4_t val) { + SVE_ACLE_FUNC(svwrite_ver_za64,_f64,_vg4,)(7, base, 0, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za64_s64_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 6) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg4.nxv2i64(i32 7, i32 [[BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_ver_za64_s64_vg4j11svint64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 2) +// 
CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 6) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg4.nxv2i64(i32 7, i32 [[BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za64_s64_vg4(uint32_t base, svint64x4_t val) { + SVE_ACLE_FUNC(svwrite_ver_za64,_s64,_vg4,)(7, base, 0, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_za64_u64_vg1x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[VAL]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vg1x2.nxv2i64(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_za64_u64_vg1x2j12svuint64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[VAL]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vg1x2.nxv2i64(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_za64_u64_vg1x2(uint32_t base, svuint64x2_t val) { + SVE_ACLE_FUNC(svwrite_za64,_u64,_vg1x2,)(base, 7, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_za64_f64_vg1x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[VAL]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vg1x2.nxv2f64(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_za64_f64_vg1x2j13svfloat64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[VAL]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vg1x2.nxv2f64(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_za64_f64_vg1x2(uint32_t base, svfloat64x2_t val) { + SVE_ACLE_FUNC(svwrite_za64,_f64,_vg1x2,)(base, 7, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_za64_s64_vg1x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[VAL]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vg1x2.nxv2i64(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_za64_s64_vg1x2j11svint64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[VAL]], 
i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vg1x2.nxv2i64(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_za64_s64_vg1x2(uint32_t base, svint64x2_t val) { + SVE_ACLE_FUNC(svwrite_za64,_s64,_vg1x2,)(base, 7, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_za64_u64_vg1x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vg1x4.nxv2i64(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_za64_u64_vg1x4j12svuint64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vg1x4.nxv2i64(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_za64_u64_vg1x4(uint32_t base, svuint64x4_t val) { + SVE_ACLE_FUNC(svwrite_za64,_u64,_vg1x4,)(base, 7, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_za64_f64_vg1x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[VAL]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[VAL]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[VAL]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vg1x4.nxv2f64(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_za64_f64_vg1x4j13svfloat64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[VAL]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[VAL]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[VAL]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vg1x4.nxv2f64(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_za64_f64_vg1x4(uint32_t base, svfloat64x4_t val) { + SVE_ACLE_FUNC(svwrite_za64,_f64,_vg1x4,)(base, 7, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_za64_s64_vg1x4( +// 
CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vg1x4.nxv2i64(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_za64_s64_vg1x4j11svint64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vg1x4.nxv2i64(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_za64_s64_vg1x4(uint32_t base, svint64x4_t val) { + SVE_ACLE_FUNC(svwrite_za64,_s64,_vg1x4,)(base, 7, val); +} diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_zero_zt.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_zero_zt.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_zero_zt.c @@ -0,0 +1,33 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py + +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. 
+#define SVE_ACLE_FUNC(A1) A1 +#else +#define SVE_ACLE_FUNC(A1) A1 +#endif + +// CHECK-LABEL: @test_svzero_zt( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.zero.zt(i32 0) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z14test_svzero_ztv( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.zero.zt(i32 0) +// CPP-CHECK-NEXT: ret void +// +__attribute__((arm_shared_za)) +void test_svzero_zt(void) { + svzero_zt(0); +} diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sve_add.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sve_add.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sve_add.c @@ -0,0 +1,556 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py + +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 +#endif + +// +// Multi-Single Vector +// + +// x2 + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svadd_vector_single2_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv16i8( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP4]], [[TMP5]], i64 16) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z28test_svadd_vector_single2_s810svint8x2_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv16i8( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP4]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svint8x2_t test_svadd_vector_single2_s8(svint8x2_t zn, svint8_t zm) { + return SVE_ACLE_FUNC(svadd,_single_s8_x2,,)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svadd_vector_single2_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv16i8( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP4]], [[TMP5]], i64 16) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z28test_svadd_vector_single2_u811svuint8x2_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv16i8( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP4]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svuint8x2_t 
test_svadd_vector_single2_u8(svuint8x2_t zn, svuint8_t zm) { + return SVE_ACLE_FUNC(svadd,_single_u8_x2,,)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svadd_vector_single2_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv8i16( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z29test_svadd_vector_single2_s1611svint16x2_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv8i16( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svint16x2_t test_svadd_vector_single2_s16(svint16x2_t zn, svint16_t zm) { + return SVE_ACLE_FUNC(svadd,_single_s16_x2,,)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svadd_vector_single2_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv8i16( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z29test_svadd_vector_single2_u1612svuint16x2_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv8i16( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svuint16x2_t test_svadd_vector_single2_u16(svuint16x2_t zn, svuint16_t zm) { 
+ return SVE_ACLE_FUNC(svadd,_single_u16_x2,,)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svadd_vector_single2_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv4i32( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z29test_svadd_vector_single2_s3211svint32x2_tu11__SVInt32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv4i32( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svint32x2_t test_svadd_vector_single2_s32(svint32x2_t zn, svint32_t zm) { + return SVE_ACLE_FUNC(svadd,_single_s32_x2,,)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svadd_vector_single2_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv4i32( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z29test_svadd_vector_single2_u3212svuint32x2_tu12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv4i32( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svuint32x2_t test_svadd_vector_single2_u32(svuint32x2_t zn, svuint32_t zm) { + return SVE_ACLE_FUNC(svadd,_single_u32_x2,,)(zn, zm); +} + + 
+__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svadd_vector_single2_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv2i64( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP4]], [[TMP5]], i64 2) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z29test_svadd_vector_single2_s6411svint64x2_tu11__SVInt64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv2i64( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP4]], [[TMP5]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svint64x2_t test_svadd_vector_single2_s64(svint64x2_t zn, svint64_t zm) { + return SVE_ACLE_FUNC(svadd,_single_s64_x2,,)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svadd_vector_single2_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv2i64( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP4]], [[TMP5]], i64 2) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z29test_svadd_vector_single2_u6412svuint64x2_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv2i64( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP4]], [[TMP5]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svuint64x2_t test_svadd_vector_single2_u64(svuint64x2_t zn, svuint64_t zm) { + return SVE_ACLE_FUNC(svadd,_single_u64_x2,,)(zn, zm); +} + + +// x4 + +__attribute__((arm_streaming)) +// CHECK-LABEL: 
@test_svadd_vector_single4_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP8]], [[TMP9]], i64 32) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 48) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z28test_svadd_vector_single4_s810svint8x4_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP8]], [[TMP9]], i64 32) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svint8x4_t test_svadd_vector_single4_s8(svint8x4_t zn, svint8_t zm) { + return SVE_ACLE_FUNC(svadd,_single_s8_x4,,)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svadd_vector_single4_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// 
CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP8]], [[TMP9]], i64 32) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 48) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z28test_svadd_vector_single4_u811svuint8x4_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP8]], [[TMP9]], i64 32) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svuint8x4_t test_svadd_vector_single4_u8(svuint8x4_t zn, svuint8_t zm) { + return SVE_ACLE_FUNC(svadd,_single_u8_x4,,)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svadd_vector_single4_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 8) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP8]], [[TMP9]], i64 16) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 24) +// CHECK-NEXT: ret 
[[TMP12]] +// +// CPP-CHECK-LABEL: @_Z29test_svadd_vector_single4_s1611svint16x4_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 8) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP8]], [[TMP9]], i64 16) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svint16x4_t test_svadd_vector_single4_s16(svint16x4_t zn, svint16_t zm) { + return SVE_ACLE_FUNC(svadd,_single_s16_x4,,)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svadd_vector_single4_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 8) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP8]], [[TMP9]], i64 16) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 24) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z29test_svadd_vector_single4_u1612svuint16x4_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv8i16( 
[[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 8) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP8]], [[TMP9]], i64 16) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svuint16x4_t test_svadd_vector_single4_u16(svuint16x4_t zn, svuint16_t zm) { + return SVE_ACLE_FUNC(svadd,_single_u16_x4,,)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svadd_vector_single4_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP8]], [[TMP9]], i64 8) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 12) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z29test_svadd_vector_single4_s3211svint32x4_tu11__SVInt32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP8]], [[TMP9]], i64 8) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = 
extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svint32x4_t test_svadd_vector_single4_s32(svint32x4_t zn, svint32_t zm) { + return SVE_ACLE_FUNC(svadd,_single_s32_x4,,)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svadd_vector_single4_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP8]], [[TMP9]], i64 8) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 12) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z29test_svadd_vector_single4_u3212svuint32x4_tu12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP8]], [[TMP9]], i64 8) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svuint32x4_t test_svadd_vector_single4_u32(svuint32x4_t zn, svuint32_t zm) { + return SVE_ACLE_FUNC(svadd,_single_u32_x4,,)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svadd_vector_single4_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// 
CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 2) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP8]], [[TMP9]], i64 4) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP10]], [[TMP11]], i64 6) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z29test_svadd_vector_single4_s6411svint64x4_tu11__SVInt64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 2) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP8]], [[TMP9]], i64 4) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP10]], [[TMP11]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svint64x4_t test_svadd_vector_single4_s64(svint64x4_t zn, svint64_t zm) { + return SVE_ACLE_FUNC(svadd,_single_s64_x4,,)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svadd_vector_single4_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], 
[[TMP7]], i64 2) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP8]], [[TMP9]], i64 4) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP10]], [[TMP11]], i64 6) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z29test_svadd_vector_single4_u6412svuint64x4_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 2) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP8]], [[TMP9]], i64 4) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP10]], [[TMP11]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svuint64x4_t test_svadd_vector_single4_u64(svuint64x4_t zn, svuint64_t zm) { + return SVE_ACLE_FUNC(svadd,_single_u64_x4,,)(zn, zm); +} diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st2-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st2-bfloat.c --- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st2-bfloat.c +++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st2-bfloat.c @@ -14,20 +14,20 @@ #else #define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 #endif -// CHECK-LABEL: define {{[^@]+}}@test_svst2_bf16( +// CHECK-LABEL: @test_svst2_bf16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[DATA]], i64 8) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8bf16( [[TMP1]], [[TMP2]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8bf16( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z15test_svst2_bf16u10__SVBool_tPu6__bf1614svbfloat16x2_t( +// CPP-CHECK-LABEL: @_Z15test_svst2_bf16u10__SVBool_tPu6__bf1614svbfloat16x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call 
@llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[DATA]], i64 8) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8bf16( [[TMP1]], [[TMP2]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8bf16( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst2_bf16(svbool_t pg, bfloat16_t *base, svbfloat16x2_t data) @@ -35,22 +35,22 @@ return SVE_ACLE_FUNC(svst2,_bf16,,)(pg, base, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst2_vnum_bf16( +// CHECK-LABEL: @test_svst2_vnum_bf16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[DATA]], i64 8) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8bf16( [[TMP2]], [[TMP3]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8bf16( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP3]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z20test_svst2_vnum_bf16u10__SVBool_tPu6__bf16l14svbfloat16x2_t( +// CPP-CHECK-LABEL: @_Z20test_svst2_vnum_bf16u10__SVBool_tPu6__bf16l14svbfloat16x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[DATA]], i64 8) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8bf16( [[TMP2]], [[TMP3]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8bf16( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP3]]) // CPP-CHECK-NEXT: ret void // void test_svst2_vnum_bf16(svbool_t pg, bfloat16_t *base, int64_t vnum, svbfloat16x2_t data) diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st2.c 
b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st2.c --- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st2.c +++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st2.c @@ -14,14 +14,14 @@ #define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 #endif -// CHECK-LABEL: define {{[^@]+}}@test_svst2_s8( +// CHECK-LABEL: @test_svst2_s8( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[DATA:%.*]], i64 0) // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[DATA]], i64 16) // CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv16i8( [[TMP0]], [[TMP1]], [[PG:%.*]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z13test_svst2_s8u10__SVBool_tPa10svint8x2_t( +// CPP-CHECK-LABEL: @_Z13test_svst2_s8u10__SVBool_tPa10svint8x2_t( // CPP-CHECK-NEXT: entry: // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[DATA:%.*]], i64 0) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[DATA]], i64 16) @@ -33,20 +33,20 @@ return SVE_ACLE_FUNC(svst2,_s8,,)(pg, base, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst2_s16( +// CHECK-LABEL: @test_svst2_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA]], i64 8) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8i16( [[TMP1]], [[TMP2]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst2_s16u10__SVBool_tPs11svint16x2_t( +// CPP-CHECK-LABEL: @_Z14test_svst2_s16u10__SVBool_tPs11svint16x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA]], i64 8) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8i16( [[TMP1]], [[TMP2]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst2_s16(svbool_t pg, int16_t *base, svint16x2_t data) @@ -54,20 +54,20 @@ return SVE_ACLE_FUNC(svst2,_s16,,)(pg, base, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst2_s32( +// CHECK-LABEL: @test_svst2_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call 
@llvm.vector.extract.nxv4i32.nxv8i32( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA]], i64 4) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv4i32( [[TMP1]], [[TMP2]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst2_s32u10__SVBool_tPi11svint32x2_t( +// CPP-CHECK-LABEL: @_Z14test_svst2_s32u10__SVBool_tPi11svint32x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA]], i64 4) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv4i32( [[TMP1]], [[TMP2]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst2_s32(svbool_t pg, int32_t *base, svint32x2_t data) @@ -75,20 +75,20 @@ return SVE_ACLE_FUNC(svst2,_s32,,)(pg, base, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst2_s64( +// CHECK-LABEL: @test_svst2_s64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA]], i64 2) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv2i64( [[TMP1]], [[TMP2]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst2_s64u10__SVBool_tPl11svint64x2_t( +// CPP-CHECK-LABEL: @_Z14test_svst2_s64u10__SVBool_tPl11svint64x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA]], i64 2) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv2i64( [[TMP1]], [[TMP2]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA:%.*]], i64 0) +// 
CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst2_s64(svbool_t pg, int64_t *base, svint64x2_t data) @@ -96,14 +96,14 @@ return SVE_ACLE_FUNC(svst2,_s64,,)(pg, base, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst2_u8( +// CHECK-LABEL: @test_svst2_u8( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[DATA:%.*]], i64 0) // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[DATA]], i64 16) // CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv16i8( [[TMP0]], [[TMP1]], [[PG:%.*]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z13test_svst2_u8u10__SVBool_tPh11svuint8x2_t( +// CPP-CHECK-LABEL: @_Z13test_svst2_u8u10__SVBool_tPh11svuint8x2_t( // CPP-CHECK-NEXT: entry: // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[DATA:%.*]], i64 0) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[DATA]], i64 16) @@ -115,20 +115,20 @@ return SVE_ACLE_FUNC(svst2,_u8,,)(pg, base, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst2_u16( +// CHECK-LABEL: @test_svst2_u16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA]], i64 8) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8i16( [[TMP1]], [[TMP2]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst2_u16u10__SVBool_tPt12svuint16x2_t( +// CPP-CHECK-LABEL: @_Z14test_svst2_u16u10__SVBool_tPt12svuint16x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA]], i64 8) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8i16( [[TMP1]], [[TMP2]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst2_u16(svbool_t pg, uint16_t *base, svuint16x2_t data) @@ -136,20 +136,20 @@ return SVE_ACLE_FUNC(svst2,_u16,,)(pg, base, data); } -// CHECK-LABEL: define 
{{[^@]+}}@test_svst2_u32( +// CHECK-LABEL: @test_svst2_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA]], i64 4) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv4i32( [[TMP1]], [[TMP2]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst2_u32u10__SVBool_tPj12svuint32x2_t( +// CPP-CHECK-LABEL: @_Z14test_svst2_u32u10__SVBool_tPj12svuint32x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA]], i64 4) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv4i32( [[TMP1]], [[TMP2]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst2_u32(svbool_t pg, uint32_t *base, svuint32x2_t data) @@ -157,20 +157,20 @@ return SVE_ACLE_FUNC(svst2,_u32,,)(pg, base, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst2_u64( +// CHECK-LABEL: @test_svst2_u64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA]], i64 2) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv2i64( [[TMP1]], [[TMP2]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst2_u64u10__SVBool_tPm12svuint64x2_t( +// CPP-CHECK-LABEL: @_Z14test_svst2_u64u10__SVBool_tPm12svuint64x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA]], i64 2) 
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv2i64( [[TMP1]], [[TMP2]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst2_u64(svbool_t pg, uint64_t *base, svuint64x2_t data) @@ -178,20 +178,20 @@ return SVE_ACLE_FUNC(svst2,_u64,,)(pg, base, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst2_f16( +// CHECK-LABEL: @test_svst2_f16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[DATA]], i64 8) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8f16( [[TMP1]], [[TMP2]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst2_f16u10__SVBool_tPDh13svfloat16x2_t( +// CPP-CHECK-LABEL: @_Z14test_svst2_f16u10__SVBool_tPDh13svfloat16x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[DATA]], i64 8) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8f16( [[TMP1]], [[TMP2]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst2_f16(svbool_t pg, float16_t *base, svfloat16x2_t data) @@ -199,20 +199,20 @@ return SVE_ACLE_FUNC(svst2,_f16,,)(pg, base, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst2_f32( +// CHECK-LABEL: @test_svst2_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[DATA]], i64 4) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv4f32( [[TMP1]], [[TMP2]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call 
@llvm.vector.extract.nxv4f32.nxv8f32( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst2_f32u10__SVBool_tPf13svfloat32x2_t( +// CPP-CHECK-LABEL: @_Z14test_svst2_f32u10__SVBool_tPf13svfloat32x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[DATA]], i64 4) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv4f32( [[TMP1]], [[TMP2]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst2_f32(svbool_t pg, float32_t *base, svfloat32x2_t data) @@ -220,20 +220,20 @@ return SVE_ACLE_FUNC(svst2,_f32,,)(pg, base, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst2_f64( +// CHECK-LABEL: @test_svst2_f64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[DATA]], i64 2) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv2f64( [[TMP1]], [[TMP2]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[DATA]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst2_f64u10__SVBool_tPd13svfloat64x2_t( +// CPP-CHECK-LABEL: @_Z14test_svst2_f64u10__SVBool_tPd13svfloat64x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[DATA]], i64 2) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv2f64( [[TMP1]], [[TMP2]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[DATA]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst2_f64(svbool_t pg, float64_t *base, 
svfloat64x2_t data) @@ -241,20 +241,20 @@ return SVE_ACLE_FUNC(svst2,_f64,,)(pg, base, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst2_vnum_s8( +// CHECK-LABEL: @test_svst2_vnum_s8( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[DATA]], i64 16) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv16i8( [[TMP1]], [[TMP2]], [[PG:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[DATA]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv16i8( [[TMP0]], [[TMP1]], [[PG:%.*]], ptr [[TMP2]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z18test_svst2_vnum_s8u10__SVBool_tPal10svint8x2_t( +// CPP-CHECK-LABEL: @_Z18test_svst2_vnum_s8u10__SVBool_tPal10svint8x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[DATA]], i64 16) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv16i8( [[TMP1]], [[TMP2]], [[PG:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[DATA]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv16i8( [[TMP0]], [[TMP1]], [[PG:%.*]], ptr [[TMP2]]) // CPP-CHECK-NEXT: ret void // void test_svst2_vnum_s8(svbool_t pg, int8_t *base, int64_t vnum, svint8x2_t data) @@ -262,22 +262,22 @@ return SVE_ACLE_FUNC(svst2_vnum,_s8,,)(pg, base, vnum, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst2_vnum_s16( +// CHECK-LABEL: @test_svst2_vnum_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA]], i64 8) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8i16( [[TMP2]], [[TMP3]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP3]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst2_vnum_s16u10__SVBool_tPsl11svint16x2_t( +// CPP-CHECK-LABEL: @_Z19test_svst2_vnum_s16u10__SVBool_tPsl11svint16x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail 
call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA]], i64 8) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8i16( [[TMP2]], [[TMP3]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP3]]) // CPP-CHECK-NEXT: ret void // void test_svst2_vnum_s16(svbool_t pg, int16_t *base, int64_t vnum, svint16x2_t data) @@ -285,22 +285,22 @@ return SVE_ACLE_FUNC(svst2_vnum,_s16,,)(pg, base, vnum, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst2_vnum_s32( +// CHECK-LABEL: @test_svst2_vnum_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA]], i64 4) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv4i32( [[TMP2]], [[TMP3]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP3]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst2_vnum_s32u10__SVBool_tPil11svint32x2_t( +// CPP-CHECK-LABEL: @_Z19test_svst2_vnum_s32u10__SVBool_tPil11svint32x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA]], i64 4) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv4i32( [[TMP2]], [[TMP3]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP3]]) // CPP-CHECK-NEXT: ret void // void test_svst2_vnum_s32(svbool_t pg, 
int32_t *base, int64_t vnum, svint32x2_t data) @@ -308,22 +308,22 @@ return SVE_ACLE_FUNC(svst2_vnum,_s32,,)(pg, base, vnum, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst2_vnum_s64( +// CHECK-LABEL: @test_svst2_vnum_s64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA]], i64 2) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv2i64( [[TMP2]], [[TMP3]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP3]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst2_vnum_s64u10__SVBool_tPll11svint64x2_t( +// CPP-CHECK-LABEL: @_Z19test_svst2_vnum_s64u10__SVBool_tPll11svint64x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA]], i64 2) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv2i64( [[TMP2]], [[TMP3]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP3]]) // CPP-CHECK-NEXT: ret void // void test_svst2_vnum_s64(svbool_t pg, int64_t *base, int64_t vnum, svint64x2_t data) @@ -331,20 +331,20 @@ return SVE_ACLE_FUNC(svst2_vnum,_s64,,)(pg, base, vnum, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst2_vnum_u8( +// CHECK-LABEL: @test_svst2_vnum_u8( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[DATA]], i64 16) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv16i8( [[TMP1]], [[TMP2]], [[PG:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[DATA]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv16i8( [[TMP0]], [[TMP1]], [[PG:%.*]], ptr [[TMP2]]) // CHECK-NEXT: ret 
void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z18test_svst2_vnum_u8u10__SVBool_tPhl11svuint8x2_t( +// CPP-CHECK-LABEL: @_Z18test_svst2_vnum_u8u10__SVBool_tPhl11svuint8x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[DATA]], i64 16) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv16i8( [[TMP1]], [[TMP2]], [[PG:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[DATA]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv16i8( [[TMP0]], [[TMP1]], [[PG:%.*]], ptr [[TMP2]]) // CPP-CHECK-NEXT: ret void // void test_svst2_vnum_u8(svbool_t pg, uint8_t *base, int64_t vnum, svuint8x2_t data) @@ -352,22 +352,22 @@ return SVE_ACLE_FUNC(svst2_vnum,_u8,,)(pg, base, vnum, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst2_vnum_u16( +// CHECK-LABEL: @test_svst2_vnum_u16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA]], i64 8) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8i16( [[TMP2]], [[TMP3]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP3]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst2_vnum_u16u10__SVBool_tPtl12svuint16x2_t( +// CPP-CHECK-LABEL: @_Z19test_svst2_vnum_u16u10__SVBool_tPtl12svuint16x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA]], i64 8) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8i16( [[TMP2]], [[TMP3]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP3]]) // CPP-CHECK-NEXT: ret void 
// void test_svst2_vnum_u16(svbool_t pg, uint16_t *base, int64_t vnum, svuint16x2_t data) @@ -375,22 +375,22 @@ return SVE_ACLE_FUNC(svst2_vnum,_u16,,)(pg, base, vnum, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst2_vnum_u32( +// CHECK-LABEL: @test_svst2_vnum_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA]], i64 4) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv4i32( [[TMP2]], [[TMP3]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP3]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst2_vnum_u32u10__SVBool_tPjl12svuint32x2_t( +// CPP-CHECK-LABEL: @_Z19test_svst2_vnum_u32u10__SVBool_tPjl12svuint32x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA]], i64 4) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv4i32( [[TMP2]], [[TMP3]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP3]]) // CPP-CHECK-NEXT: ret void // void test_svst2_vnum_u32(svbool_t pg, uint32_t *base, int64_t vnum, svuint32x2_t data) @@ -398,22 +398,22 @@ return SVE_ACLE_FUNC(svst2_vnum,_u32,,)(pg, base, vnum, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst2_vnum_u64( +// CHECK-LABEL: @test_svst2_vnum_u64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA]], i64 2) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv2i64( [[TMP2]], [[TMP3]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call 
@llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP3]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst2_vnum_u64u10__SVBool_tPml12svuint64x2_t( +// CPP-CHECK-LABEL: @_Z19test_svst2_vnum_u64u10__SVBool_tPml12svuint64x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA]], i64 2) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv2i64( [[TMP2]], [[TMP3]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP3]]) // CPP-CHECK-NEXT: ret void // void test_svst2_vnum_u64(svbool_t pg, uint64_t *base, int64_t vnum, svuint64x2_t data) @@ -421,22 +421,22 @@ return SVE_ACLE_FUNC(svst2_vnum,_u64,,)(pg, base, vnum, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst2_vnum_f16( +// CHECK-LABEL: @test_svst2_vnum_f16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[DATA]], i64 8) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8f16( [[TMP2]], [[TMP3]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP3]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst2_vnum_f16u10__SVBool_tPDhl13svfloat16x2_t( +// CPP-CHECK-LABEL: @_Z19test_svst2_vnum_f16u10__SVBool_tPDhl13svfloat16x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[DATA]], i64 8) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8f16( [[TMP2]], [[TMP3]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call 
@llvm.vector.extract.nxv8f16.nxv16f16( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP3]]) // CPP-CHECK-NEXT: ret void // void test_svst2_vnum_f16(svbool_t pg, float16_t *base, int64_t vnum, svfloat16x2_t data) @@ -444,22 +444,22 @@ return SVE_ACLE_FUNC(svst2_vnum,_f16,,)(pg, base, vnum, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst2_vnum_f32( +// CHECK-LABEL: @test_svst2_vnum_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[DATA]], i64 4) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv4f32( [[TMP2]], [[TMP3]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP3]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst2_vnum_f32u10__SVBool_tPfl13svfloat32x2_t( +// CPP-CHECK-LABEL: @_Z19test_svst2_vnum_f32u10__SVBool_tPfl13svfloat32x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[DATA]], i64 4) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv4f32( [[TMP2]], [[TMP3]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP3]]) // CPP-CHECK-NEXT: ret void // void test_svst2_vnum_f32(svbool_t pg, float32_t *base, int64_t vnum, svfloat32x2_t data) @@ -467,22 +467,22 @@ return SVE_ACLE_FUNC(svst2_vnum,_f32,,)(pg, base, vnum, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst2_vnum_f64( +// CHECK-LABEL: @test_svst2_vnum_f64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call 
@llvm.vector.extract.nxv2f64.nxv4f64( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[DATA]], i64 2) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv2f64( [[TMP2]], [[TMP3]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[DATA]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP3]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst2_vnum_f64u10__SVBool_tPdl13svfloat64x2_t( +// CPP-CHECK-LABEL: @_Z19test_svst2_vnum_f64u10__SVBool_tPdl13svfloat64x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[DATA]], i64 2) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv2f64( [[TMP2]], [[TMP3]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[DATA]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP3]]) // CPP-CHECK-NEXT: ret void // void test_svst2_vnum_f64(svbool_t pg, float64_t *base, int64_t vnum, svfloat64x2_t data) diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st3-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st3-bfloat.c --- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st3-bfloat.c +++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st3-bfloat.c @@ -15,22 +15,22 @@ #define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 #endif -// CHECK-LABEL: define {{[^@]+}}@test_svst3_bf16( +// CHECK-LABEL: @test_svst3_bf16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[DATA]], i64 8) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[DATA]], i64 16) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8bf16( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[DATA]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8bf16( 
[[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z15test_svst3_bf16u10__SVBool_tPu6__bf1614svbfloat16x3_t( +// CPP-CHECK-LABEL: @_Z15test_svst3_bf16u10__SVBool_tPu6__bf1614svbfloat16x3_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[DATA]], i64 8) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[DATA]], i64 16) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8bf16( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[DATA]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8bf16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst3_bf16(svbool_t pg, bfloat16_t *base, svbfloat16x3_t data) @@ -38,24 +38,24 @@ return SVE_ACLE_FUNC(svst3,_bf16,,)(pg, base, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst3_vnum_bf16( +// CHECK-LABEL: @test_svst3_vnum_bf16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[DATA]], i64 8) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[DATA]], i64 16) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8bf16( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[DATA]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8bf16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z20test_svst3_vnum_bf16u10__SVBool_tPu6__bf16l14svbfloat16x3_t( +// CPP-CHECK-LABEL: @_Z20test_svst3_vnum_bf16u10__SVBool_tPu6__bf16l14svbfloat16x3_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[DATA]], i64 8) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail 
call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[DATA]], i64 16) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8bf16( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[DATA]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8bf16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) // CPP-CHECK-NEXT: ret void // void test_svst3_vnum_bf16(svbool_t pg, bfloat16_t *base, int64_t vnum, svbfloat16x3_t data) diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st3.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st3.c --- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st3.c +++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st3.c @@ -14,7 +14,7 @@ #define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 #endif -// CHECK-LABEL: define {{[^@]+}}@test_svst3_s8( +// CHECK-LABEL: @test_svst3_s8( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA:%.*]], i64 0) // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA]], i64 16) @@ -22,7 +22,7 @@ // CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[PG:%.*]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z13test_svst3_s8u10__SVBool_tPa10svint8x3_t( +// CPP-CHECK-LABEL: @_Z13test_svst3_s8u10__SVBool_tPa10svint8x3_t( // CPP-CHECK-NEXT: entry: // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA:%.*]], i64 0) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA]], i64 16) @@ -35,22 +35,22 @@ return SVE_ACLE_FUNC(svst3,_s8,,)(pg, base, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst3_s16( +// CHECK-LABEL: @test_svst3_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 8) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 16) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8i16( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst3_s16u10__SVBool_tPs11svint16x3_t( +// CPP-CHECK-LABEL: @_Z14test_svst3_s16u10__SVBool_tPs11svint16x3_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: 
[[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 8) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 16) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8i16( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst3_s16(svbool_t pg, int16_t *base, svint16x3_t data) @@ -58,22 +58,22 @@ return SVE_ACLE_FUNC(svst3,_s16,,)(pg, base, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst3_s32( +// CHECK-LABEL: @test_svst3_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 4) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 8) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv4i32( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst3_s32u10__SVBool_tPi11svint32x3_t( +// CPP-CHECK-LABEL: @_Z14test_svst3_s32u10__SVBool_tPi11svint32x3_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 4) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 8) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv4i32( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void 
@llvm.aarch64.sve.st3.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst3_s32(svbool_t pg, int32_t *base, svint32x3_t data) @@ -81,22 +81,22 @@ return SVE_ACLE_FUNC(svst3,_s32,,)(pg, base, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst3_s64( +// CHECK-LABEL: @test_svst3_s64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 2) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 4) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv2i64( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst3_s64u10__SVBool_tPl11svint64x3_t( +// CPP-CHECK-LABEL: @_Z14test_svst3_s64u10__SVBool_tPl11svint64x3_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 2) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 4) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv2i64( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst3_s64(svbool_t pg, int64_t *base, svint64x3_t data) @@ -104,7 +104,7 @@ return SVE_ACLE_FUNC(svst3,_s64,,)(pg, base, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst3_u8( +// CHECK-LABEL: @test_svst3_u8( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA:%.*]], i64 0) // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA]], i64 16) @@ -112,7 +112,7 @@ // CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[PG:%.*]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z13test_svst3_u8u10__SVBool_tPh11svuint8x3_t( +// CPP-CHECK-LABEL: @_Z13test_svst3_u8u10__SVBool_tPh11svuint8x3_t( // CPP-CHECK-NEXT: entry: // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call 
@llvm.vector.extract.nxv16i8.nxv48i8( [[DATA:%.*]], i64 0) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA]], i64 16) @@ -125,22 +125,22 @@ return SVE_ACLE_FUNC(svst3,_u8,,)(pg, base, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst3_u16( +// CHECK-LABEL: @test_svst3_u16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 8) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 16) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8i16( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst3_u16u10__SVBool_tPt12svuint16x3_t( +// CPP-CHECK-LABEL: @_Z14test_svst3_u16u10__SVBool_tPt12svuint16x3_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 8) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 16) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8i16( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst3_u16(svbool_t pg, uint16_t *base, svuint16x3_t data) @@ -148,22 +148,22 @@ return SVE_ACLE_FUNC(svst3,_u16,,)(pg, base, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst3_u32( +// CHECK-LABEL: @test_svst3_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 4) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 8) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv4i32( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA:%.*]], i64 0) +// 
CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst3_u32u10__SVBool_tPj12svuint32x3_t( +// CPP-CHECK-LABEL: @_Z14test_svst3_u32u10__SVBool_tPj12svuint32x3_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 4) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 8) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv4i32( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst3_u32(svbool_t pg, uint32_t *base, svuint32x3_t data) @@ -171,22 +171,22 @@ return SVE_ACLE_FUNC(svst3,_u32,,)(pg, base, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst3_u64( +// CHECK-LABEL: @test_svst3_u64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 2) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 4) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv2i64( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst3_u64u10__SVBool_tPm12svuint64x3_t( +// CPP-CHECK-LABEL: @_Z14test_svst3_u64u10__SVBool_tPm12svuint64x3_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 2) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call 
@llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 4) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv2i64( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst3_u64(svbool_t pg, uint64_t *base, svuint64x3_t data) @@ -194,22 +194,22 @@ return SVE_ACLE_FUNC(svst3,_u64,,)(pg, base, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst3_f16( +// CHECK-LABEL: @test_svst3_f16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[DATA]], i64 8) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[DATA]], i64 16) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8f16( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[DATA]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst3_f16u10__SVBool_tPDh13svfloat16x3_t( +// CPP-CHECK-LABEL: @_Z14test_svst3_f16u10__SVBool_tPDh13svfloat16x3_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[DATA]], i64 8) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[DATA]], i64 16) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8f16( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[DATA]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst3_f16(svbool_t pg, float16_t *base, svfloat16x3_t data) @@ -217,22 +217,22 @@ return SVE_ACLE_FUNC(svst3,_f16,,)(pg, base, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst3_f32( +// CHECK-LABEL: 
@test_svst3_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[DATA]], i64 4) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[DATA]], i64 8) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv4f32( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst3_f32u10__SVBool_tPf13svfloat32x3_t( +// CPP-CHECK-LABEL: @_Z14test_svst3_f32u10__SVBool_tPf13svfloat32x3_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[DATA]], i64 4) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[DATA]], i64 8) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv4f32( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst3_f32(svbool_t pg, float32_t *base, svfloat32x3_t data) @@ -240,22 +240,22 @@ return SVE_ACLE_FUNC(svst3,_f32,,)(pg, base, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst3_f64( +// CHECK-LABEL: @test_svst3_f64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[DATA]], i64 2) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[DATA]], i64 4) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv2f64( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[DATA]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: tail 
call void @llvm.aarch64.sve.st3.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst3_f64u10__SVBool_tPd13svfloat64x3_t( +// CPP-CHECK-LABEL: @_Z14test_svst3_f64u10__SVBool_tPd13svfloat64x3_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[DATA]], i64 2) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[DATA]], i64 4) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv2f64( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[DATA]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst3_f64(svbool_t pg, float64_t *base, svfloat64x3_t data) @@ -263,22 +263,22 @@ return SVE_ACLE_FUNC(svst3,_f64,,)(pg, base, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst3_vnum_s8( +// CHECK-LABEL: @test_svst3_vnum_s8( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA]], i64 16) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA]], i64 32) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv16i8( [[TMP1]], [[TMP2]], [[TMP3]], [[PG:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[PG:%.*]], ptr [[TMP3]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z18test_svst3_vnum_s8u10__SVBool_tPal10svint8x3_t( +// CPP-CHECK-LABEL: @_Z18test_svst3_vnum_s8u10__SVBool_tPal10svint8x3_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA]], i64 16) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA]], i64 32) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv16i8( [[TMP1]], [[TMP2]], [[TMP3]], [[PG:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call 
@llvm.vector.extract.nxv16i8.nxv48i8( [[DATA]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[PG:%.*]], ptr [[TMP3]]) // CPP-CHECK-NEXT: ret void // void test_svst3_vnum_s8(svbool_t pg, int8_t *base, int64_t vnum, svint8x3_t data) @@ -286,24 +286,24 @@ return SVE_ACLE_FUNC(svst3_vnum,_s8,,)(pg, base, vnum, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst3_vnum_s16( +// CHECK-LABEL: @test_svst3_vnum_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 8) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 16) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8i16( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst3_vnum_s16u10__SVBool_tPsl11svint16x3_t( +// CPP-CHECK-LABEL: @_Z19test_svst3_vnum_s16u10__SVBool_tPsl11svint16x3_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 8) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 16) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8i16( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) // CPP-CHECK-NEXT: ret void // void test_svst3_vnum_s16(svbool_t pg, int16_t *base, int64_t vnum, svint16x3_t data) @@ -311,24 +311,24 @@ return SVE_ACLE_FUNC(svst3_vnum,_s16,,)(pg, base, vnum, data); } -// CHECK-LABEL: define 
{{[^@]+}}@test_svst3_vnum_s32( +// CHECK-LABEL: @test_svst3_vnum_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 4) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 8) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv4i32( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst3_vnum_s32u10__SVBool_tPil11svint32x3_t( +// CPP-CHECK-LABEL: @_Z19test_svst3_vnum_s32u10__SVBool_tPil11svint32x3_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 4) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 8) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv4i32( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) // CPP-CHECK-NEXT: ret void // void test_svst3_vnum_s32(svbool_t pg, int32_t *base, int64_t vnum, svint32x3_t data) @@ -336,24 +336,24 @@ return SVE_ACLE_FUNC(svst3_vnum,_s32,,)(pg, base, vnum, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst3_vnum_s64( +// CHECK-LABEL: @test_svst3_vnum_s64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 2) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 4) -// CHECK-NEXT: tail call void 
@llvm.aarch64.sve.st3.nxv2i64( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst3_vnum_s64u10__SVBool_tPll11svint64x3_t( +// CPP-CHECK-LABEL: @_Z19test_svst3_vnum_s64u10__SVBool_tPll11svint64x3_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 2) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 4) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv2i64( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) // CPP-CHECK-NEXT: ret void // void test_svst3_vnum_s64(svbool_t pg, int64_t *base, int64_t vnum, svint64x3_t data) @@ -361,22 +361,22 @@ return SVE_ACLE_FUNC(svst3_vnum,_s64,,)(pg, base, vnum, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst3_vnum_u8( +// CHECK-LABEL: @test_svst3_vnum_u8( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA]], i64 16) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA]], i64 32) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv16i8( [[TMP1]], [[TMP2]], [[TMP3]], [[PG:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[PG:%.*]], ptr [[TMP3]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z18test_svst3_vnum_u8u10__SVBool_tPhl11svuint8x3_t( +// 
CPP-CHECK-LABEL: @_Z18test_svst3_vnum_u8u10__SVBool_tPhl11svuint8x3_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA]], i64 16) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA]], i64 32) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv16i8( [[TMP1]], [[TMP2]], [[TMP3]], [[PG:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[PG:%.*]], ptr [[TMP3]]) // CPP-CHECK-NEXT: ret void // void test_svst3_vnum_u8(svbool_t pg, uint8_t *base, int64_t vnum, svuint8x3_t data) @@ -384,24 +384,24 @@ return SVE_ACLE_FUNC(svst3_vnum,_u8,,)(pg, base, vnum, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst3_vnum_u16( +// CHECK-LABEL: @test_svst3_vnum_u16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 8) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 16) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8i16( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst3_vnum_u16u10__SVBool_tPtl12svuint16x3_t( +// CPP-CHECK-LABEL: @_Z19test_svst3_vnum_u16u10__SVBool_tPtl12svuint16x3_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 8) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 16) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8i16( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( 
[[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) // CPP-CHECK-NEXT: ret void // void test_svst3_vnum_u16(svbool_t pg, uint16_t *base, int64_t vnum, svuint16x3_t data) @@ -409,24 +409,24 @@ return SVE_ACLE_FUNC(svst3_vnum,_u16,,)(pg, base, vnum, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst3_vnum_u32( +// CHECK-LABEL: @test_svst3_vnum_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 4) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 8) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv4i32( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst3_vnum_u32u10__SVBool_tPjl12svuint32x3_t( +// CPP-CHECK-LABEL: @_Z19test_svst3_vnum_u32u10__SVBool_tPjl12svuint32x3_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 4) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 8) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv4i32( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) // CPP-CHECK-NEXT: ret void // void 
test_svst3_vnum_u32(svbool_t pg, uint32_t *base, int64_t vnum, svuint32x3_t data) @@ -434,24 +434,24 @@ return SVE_ACLE_FUNC(svst3_vnum,_u32,,)(pg, base, vnum, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst3_vnum_u64( +// CHECK-LABEL: @test_svst3_vnum_u64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 2) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 4) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv2i64( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst3_vnum_u64u10__SVBool_tPml12svuint64x3_t( +// CPP-CHECK-LABEL: @_Z19test_svst3_vnum_u64u10__SVBool_tPml12svuint64x3_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 2) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 4) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv2i64( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) // CPP-CHECK-NEXT: ret void // void test_svst3_vnum_u64(svbool_t pg, uint64_t *base, int64_t vnum, svuint64x3_t data) @@ -459,24 +459,24 @@ return SVE_ACLE_FUNC(svst3_vnum,_u64,,)(pg, base, vnum, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst3_vnum_f16( +// CHECK-LABEL: @test_svst3_vnum_f16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail 
call @llvm.vector.extract.nxv8f16.nxv24f16( [[DATA]], i64 8) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[DATA]], i64 16) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8f16( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[DATA]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst3_vnum_f16u10__SVBool_tPDhl13svfloat16x3_t( +// CPP-CHECK-LABEL: @_Z19test_svst3_vnum_f16u10__SVBool_tPDhl13svfloat16x3_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[DATA]], i64 8) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[DATA]], i64 16) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8f16( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[DATA]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) // CPP-CHECK-NEXT: ret void // void test_svst3_vnum_f16(svbool_t pg, float16_t *base, int64_t vnum, svfloat16x3_t data) @@ -484,24 +484,24 @@ return SVE_ACLE_FUNC(svst3_vnum,_f16,,)(pg, base, vnum, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst3_vnum_f32( +// CHECK-LABEL: @test_svst3_vnum_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[DATA]], i64 4) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[DATA]], i64 8) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv4f32( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[DATA]], i64 8) +// CHECK-NEXT: 
[[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst3_vnum_f32u10__SVBool_tPfl13svfloat32x3_t( +// CPP-CHECK-LABEL: @_Z19test_svst3_vnum_f32u10__SVBool_tPfl13svfloat32x3_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[DATA]], i64 4) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[DATA]], i64 8) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv4f32( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) // CPP-CHECK-NEXT: ret void // void test_svst3_vnum_f32(svbool_t pg, float32_t *base, int64_t vnum, svfloat32x3_t data) @@ -509,24 +509,24 @@ return SVE_ACLE_FUNC(svst3_vnum,_f32,,)(pg, base, vnum, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst3_vnum_f64( +// CHECK-LABEL: @test_svst3_vnum_f64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[DATA]], i64 2) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[DATA]], i64 4) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv2f64( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[DATA]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst3_vnum_f64u10__SVBool_tPdl13svfloat64x3_t( +// CPP-CHECK-LABEL: @_Z19test_svst3_vnum_f64u10__SVBool_tPdl13svfloat64x3_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( 
[[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[DATA]], i64 2) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[DATA]], i64 4) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv2f64( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[DATA]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) // CPP-CHECK-NEXT: ret void // void test_svst3_vnum_f64(svbool_t pg, float64_t *base, int64_t vnum, svfloat64x3_t data) diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st4-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st4-bfloat.c --- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st4-bfloat.c +++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st4-bfloat.c @@ -15,24 +15,24 @@ #define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 #endif -// CHECK-LABEL: define {{[^@]+}}@test_svst4_bf16( +// CHECK-LABEL: @test_svst4_bf16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA]], i64 8) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA]], i64 16) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA]], i64 24) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8bf16( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8bf16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z15test_svst4_bf16u10__SVBool_tPu6__bf1614svbfloat16x4_t( +// CPP-CHECK-LABEL: @_Z15test_svst4_bf16u10__SVBool_tPu6__bf1614svbfloat16x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA]], i64 8) -// CPP-CHECK-NEXT: 
[[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA]], i64 16) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA]], i64 24) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8bf16( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8bf16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst4_bf16(svbool_t pg, bfloat16_t *base, svbfloat16x4_t data) @@ -40,26 +40,26 @@ return SVE_ACLE_FUNC(svst4,_bf16,,)(pg, base, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst4_vnum_bf16( +// CHECK-LABEL: @test_svst4_vnum_bf16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA]], i64 8) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA]], i64 16) -// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA]], i64 24) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8bf16( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8bf16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z20test_svst4_vnum_bf16u10__SVBool_tPu6__bf16l14svbfloat16x4_t( +// CPP-CHECK-LABEL: @_Z20test_svst4_vnum_bf16u10__SVBool_tPu6__bf16l14svbfloat16x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA]], i64 8) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA]], i64 16) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA]], i64 24) -// 
CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8bf16( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8bf16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) // CPP-CHECK-NEXT: ret void // void test_svst4_vnum_bf16(svbool_t pg, bfloat16_t *base, int64_t vnum, svbfloat16x4_t data) diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st4.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st4.c --- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st4.c +++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st4.c @@ -14,7 +14,7 @@ #define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 #endif -// CHECK-LABEL: define {{[^@]+}}@test_svst4_s8( +// CHECK-LABEL: @test_svst4_s8( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA:%.*]], i64 0) // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 16) @@ -23,7 +23,7 @@ // CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[PG:%.*]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z13test_svst4_s8u10__SVBool_tPa10svint8x4_t( +// CPP-CHECK-LABEL: @_Z13test_svst4_s8u10__SVBool_tPa10svint8x4_t( // CPP-CHECK-NEXT: entry: // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA:%.*]], i64 0) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 16) @@ -37,24 +37,24 @@ return SVE_ACLE_FUNC(svst4,_s8,,)(pg, base, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst4_s16( +// CHECK-LABEL: @test_svst4_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 8) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 16) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 24) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8i16( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8i16( [[TMP0]], [[TMP1]], 
[[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst4_s16u10__SVBool_tPs11svint16x4_t( +// CPP-CHECK-LABEL: @_Z14test_svst4_s16u10__SVBool_tPs11svint16x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 8) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 16) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 24) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8i16( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst4_s16(svbool_t pg, int16_t *base, svint16x4_t data) @@ -62,24 +62,24 @@ return SVE_ACLE_FUNC(svst4,_s16,,)(pg, base, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst4_s32( +// CHECK-LABEL: @test_svst4_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 4) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 8) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 12) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv4i32( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst4_s32u10__SVBool_tPi11svint32x4_t( +// CPP-CHECK-LABEL: @_Z14test_svst4_s32u10__SVBool_tPi11svint32x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call 
@llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 4) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 8) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 12) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv4i32( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst4_s32(svbool_t pg, int32_t *base, svint32x4_t data) @@ -87,24 +87,24 @@ return SVE_ACLE_FUNC(svst4,_s32,,)(pg, base, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst4_s64( +// CHECK-LABEL: @test_svst4_s64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 2) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 4) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 6) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv2i64( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst4_s64u10__SVBool_tPl11svint64x4_t( +// CPP-CHECK-LABEL: @_Z14test_svst4_s64u10__SVBool_tPl11svint64x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 2) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 4) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 6) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv2i64( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call 
@llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst4_s64(svbool_t pg, int64_t *base, svint64x4_t data) @@ -112,7 +112,7 @@ return SVE_ACLE_FUNC(svst4,_s64,,)(pg, base, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst4_u8( +// CHECK-LABEL: @test_svst4_u8( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA:%.*]], i64 0) // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 16) @@ -121,7 +121,7 @@ // CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[PG:%.*]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z13test_svst4_u8u10__SVBool_tPh11svuint8x4_t( +// CPP-CHECK-LABEL: @_Z13test_svst4_u8u10__SVBool_tPh11svuint8x4_t( // CPP-CHECK-NEXT: entry: // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA:%.*]], i64 0) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 16) @@ -135,24 +135,24 @@ return SVE_ACLE_FUNC(svst4,_u8,,)(pg, base, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst4_u16( +// CHECK-LABEL: @test_svst4_u16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 8) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 16) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 24) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8i16( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst4_u16u10__SVBool_tPt12svuint16x4_t( +// CPP-CHECK-LABEL: @_Z14test_svst4_u16u10__SVBool_tPt12svuint16x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 8) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( 
[[DATA]], i64 16) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 24) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8i16( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst4_u16(svbool_t pg, uint16_t *base, svuint16x4_t data) @@ -160,24 +160,24 @@ return SVE_ACLE_FUNC(svst4,_u16,,)(pg, base, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst4_u32( +// CHECK-LABEL: @test_svst4_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 4) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 8) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 12) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv4i32( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst4_u32u10__SVBool_tPj12svuint32x4_t( +// CPP-CHECK-LABEL: @_Z14test_svst4_u32u10__SVBool_tPj12svuint32x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 4) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 8) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 12) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv4i32( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call 
@llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst4_u32(svbool_t pg, uint32_t *base, svuint32x4_t data) @@ -185,24 +185,24 @@ return SVE_ACLE_FUNC(svst4,_u32,,)(pg, base, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst4_u64( +// CHECK-LABEL: @test_svst4_u64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 2) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 4) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 6) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv2i64( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst4_u64u10__SVBool_tPm12svuint64x4_t( +// CPP-CHECK-LABEL: @_Z14test_svst4_u64u10__SVBool_tPm12svuint64x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 2) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 4) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 6) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv2i64( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst4_u64(svbool_t pg, uint64_t *base, svuint64x4_t data) @@ -210,24 +210,24 @@ return 
SVE_ACLE_FUNC(svst4,_u64,,)(pg, base, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst4_f16( +// CHECK-LABEL: @test_svst4_f16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA]], i64 8) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA]], i64 16) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA]], i64 24) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8f16( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst4_f16u10__SVBool_tPDh13svfloat16x4_t( +// CPP-CHECK-LABEL: @_Z14test_svst4_f16u10__SVBool_tPDh13svfloat16x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA]], i64 8) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA]], i64 16) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA]], i64 24) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8f16( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst4_f16(svbool_t pg, float16_t *base, svfloat16x4_t data) @@ -235,24 +235,24 @@ return SVE_ACLE_FUNC(svst4,_f16,,)(pg, base, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst4_f32( +// CHECK-LABEL: @test_svst4_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA]], i64 4) -// CHECK-NEXT: [[TMP3:%.*]] = tail call 
@llvm.vector.extract.nxv4f32.nxv16f32( [[DATA]], i64 8) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA]], i64 12) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv4f32( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst4_f32u10__SVBool_tPf13svfloat32x4_t( +// CPP-CHECK-LABEL: @_Z14test_svst4_f32u10__SVBool_tPf13svfloat32x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA]], i64 4) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA]], i64 8) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA]], i64 12) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv4f32( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst4_f32(svbool_t pg, float32_t *base, svfloat32x4_t data) @@ -260,24 +260,24 @@ return SVE_ACLE_FUNC(svst4,_f32,,)(pg, base, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst4_f64( +// CHECK-LABEL: @test_svst4_f64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA]], i64 2) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA]], i64 4) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA]], i64 6) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv2f64( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call 
@llvm.vector.extract.nxv2f64.nxv8f64( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst4_f64u10__SVBool_tPd13svfloat64x4_t( +// CPP-CHECK-LABEL: @_Z14test_svst4_f64u10__SVBool_tPd13svfloat64x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA]], i64 2) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA]], i64 4) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA]], i64 6) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv2f64( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst4_f64(svbool_t pg, float64_t *base, svfloat64x4_t data) @@ -285,24 +285,24 @@ return SVE_ACLE_FUNC(svst4,_f64,,)(pg, base, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst4_vnum_s8( +// CHECK-LABEL: @test_svst4_vnum_s8( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 16) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 32) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 48) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv16i8( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[PG:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[PG:%.*]], ptr [[TMP4]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z18test_svst4_vnum_s8u10__SVBool_tPal10svint8x4_t( +// CPP-CHECK-LABEL: 
@_Z18test_svst4_vnum_s8u10__SVBool_tPal10svint8x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 16) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 32) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 48) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv16i8( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[PG:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[PG:%.*]], ptr [[TMP4]]) // CPP-CHECK-NEXT: ret void // void test_svst4_vnum_s8(svbool_t pg, int8_t *base, int64_t vnum, svint8x4_t data) @@ -310,26 +310,26 @@ return SVE_ACLE_FUNC(svst4_vnum,_s8,,)(pg, base, vnum, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst4_vnum_s16( +// CHECK-LABEL: @test_svst4_vnum_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 8) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 16) -// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 24) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8i16( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst4_vnum_s16u10__SVBool_tPsl11svint16x4_t( +// CPP-CHECK-LABEL: @_Z19test_svst4_vnum_s16u10__SVBool_tPsl11svint16x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA:%.*]], 
i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 8) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 16) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 24) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8i16( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) // CPP-CHECK-NEXT: ret void // void test_svst4_vnum_s16(svbool_t pg, int16_t *base, int64_t vnum, svint16x4_t data) @@ -337,26 +337,26 @@ return SVE_ACLE_FUNC(svst4_vnum,_s16,,)(pg, base, vnum, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst4_vnum_s32( +// CHECK-LABEL: @test_svst4_vnum_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 4) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 12) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv4i32( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst4_vnum_s32u10__SVBool_tPil11svint32x4_t( +// CPP-CHECK-LABEL: @_Z19test_svst4_vnum_s32u10__SVBool_tPil11svint32x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 4) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call 
@llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 12) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv4i32( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) // CPP-CHECK-NEXT: ret void // void test_svst4_vnum_s32(svbool_t pg, int32_t *base, int64_t vnum, svint32x4_t data) @@ -364,26 +364,26 @@ return SVE_ACLE_FUNC(svst4_vnum,_s32,,)(pg, base, vnum, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst4_vnum_s64( +// CHECK-LABEL: @test_svst4_vnum_s64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 2) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 4) -// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 6) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv2i64( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst4_vnum_s64u10__SVBool_tPll11svint64x4_t( +// CPP-CHECK-LABEL: @_Z19test_svst4_vnum_s64u10__SVBool_tPll11svint64x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 2) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 4) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 6) -// 
CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv2i64( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) // CPP-CHECK-NEXT: ret void // void test_svst4_vnum_s64(svbool_t pg, int64_t *base, int64_t vnum, svint64x4_t data) @@ -391,24 +391,24 @@ return SVE_ACLE_FUNC(svst4_vnum,_s64,,)(pg, base, vnum, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst4_vnum_u8( +// CHECK-LABEL: @test_svst4_vnum_u8( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 16) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 32) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 48) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv16i8( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[PG:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[PG:%.*]], ptr [[TMP4]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z18test_svst4_vnum_u8u10__SVBool_tPhl11svuint8x4_t( +// CPP-CHECK-LABEL: @_Z18test_svst4_vnum_u8u10__SVBool_tPhl11svuint8x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 16) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 32) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 48) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv16i8( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[PG:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = 
tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[PG:%.*]], ptr [[TMP4]]) // CPP-CHECK-NEXT: ret void // void test_svst4_vnum_u8(svbool_t pg, uint8_t *base, int64_t vnum, svuint8x4_t data) @@ -416,26 +416,26 @@ return SVE_ACLE_FUNC(svst4_vnum,_u8,,)(pg, base, vnum, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst4_vnum_u16( +// CHECK-LABEL: @test_svst4_vnum_u16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 8) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 16) -// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 24) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8i16( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst4_vnum_u16u10__SVBool_tPtl12svuint16x4_t( +// CPP-CHECK-LABEL: @_Z19test_svst4_vnum_u16u10__SVBool_tPtl12svuint16x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 8) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 16) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 24) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8i16( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] 
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) // CPP-CHECK-NEXT: ret void // void test_svst4_vnum_u16(svbool_t pg, uint16_t *base, int64_t vnum, svuint16x4_t data) @@ -443,26 +443,26 @@ return SVE_ACLE_FUNC(svst4_vnum,_u16,,)(pg, base, vnum, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst4_vnum_u32( +// CHECK-LABEL: @test_svst4_vnum_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 4) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 12) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv4i32( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst4_vnum_u32u10__SVBool_tPjl12svuint32x4_t( +// CPP-CHECK-LABEL: @_Z19test_svst4_vnum_u32u10__SVBool_tPjl12svuint32x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 4) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 12) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv4i32( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) // CPP-CHECK-NEXT: ret 
void // void test_svst4_vnum_u32(svbool_t pg, uint32_t *base, int64_t vnum, svuint32x4_t data) @@ -470,26 +470,26 @@ return SVE_ACLE_FUNC(svst4_vnum,_u32,,)(pg, base, vnum, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst4_vnum_u64( +// CHECK-LABEL: @test_svst4_vnum_u64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 2) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 4) -// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 6) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv2i64( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst4_vnum_u64u10__SVBool_tPml12svuint64x4_t( +// CPP-CHECK-LABEL: @_Z19test_svst4_vnum_u64u10__SVBool_tPml12svuint64x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 2) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 4) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 6) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv2i64( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) // CPP-CHECK-NEXT: ret void // void test_svst4_vnum_u64(svbool_t pg, uint64_t *base, int64_t vnum, svuint64x4_t data) @@ -497,26 +497,26 @@ return SVE_ACLE_FUNC(svst4_vnum,_u64,,)(pg, base, vnum, 
data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst4_vnum_f16( +// CHECK-LABEL: @test_svst4_vnum_f16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA]], i64 8) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA]], i64 16) -// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA]], i64 24) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8f16( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst4_vnum_f16u10__SVBool_tPDhl13svfloat16x4_t( +// CPP-CHECK-LABEL: @_Z19test_svst4_vnum_f16u10__SVBool_tPDhl13svfloat16x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA]], i64 8) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA]], i64 16) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA]], i64 24) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8f16( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) // CPP-CHECK-NEXT: ret void // void test_svst4_vnum_f16(svbool_t pg, float16_t *base, int64_t vnum, svfloat16x4_t data) @@ -524,26 +524,26 @@ return SVE_ACLE_FUNC(svst4_vnum,_f16,,)(pg, base, vnum, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst4_vnum_f32( +// CHECK-LABEL: @test_svst4_vnum_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: 
[[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA]], i64 4) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA]], i64 12) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv4f32( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst4_vnum_f32u10__SVBool_tPfl13svfloat32x4_t( +// CPP-CHECK-LABEL: @_Z19test_svst4_vnum_f32u10__SVBool_tPfl13svfloat32x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA]], i64 4) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA]], i64 12) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv4f32( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) // CPP-CHECK-NEXT: ret void // void test_svst4_vnum_f32(svbool_t pg, float32_t *base, int64_t vnum, svfloat32x4_t data) @@ -551,26 +551,26 @@ return SVE_ACLE_FUNC(svst4_vnum,_f32,,)(pg, base, vnum, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst4_vnum_f64( +// CHECK-LABEL: @test_svst4_vnum_f64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], 
i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA]], i64 2) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA]], i64 4) -// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA]], i64 6) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv2f64( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst4_vnum_f64u10__SVBool_tPdl13svfloat64x4_t( +// CPP-CHECK-LABEL: @_Z19test_svst4_vnum_f64u10__SVBool_tPdl13svfloat64x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA]], i64 2) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA]], i64 4) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA]], i64 6) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv2f64( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) // CPP-CHECK-NEXT: ret void // void test_svst4_vnum_f64(svbool_t pg, float64_t *base, int64_t vnum, svfloat64x4_t data) diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_revd.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_revd.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_revd.c @@ -0,0 +1,390 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu \ +// RUN: -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 
-DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu \ +// RUN: -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu \ +// RUN: -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu \ +// RUN: -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK + +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED) A1##A3 +#else +#define SVE_ACLE_FUNC(A1, A2, A3, A4) A1##A2##A3##A4 +#endif + +// CHECK-LABEL: @test_svrevd_s8_z( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.revd.nxv16i8( zeroinitializer, [[PG:%.*]], [[OP:%.*]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z16test_svrevd_s8_zu10__SVBool_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.revd.nxv16i8( zeroinitializer, [[PG:%.*]], [[OP:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svint8_t test_svrevd_s8_z(svbool_t pg, svint8_t op) { + return SVE_ACLE_FUNC(svrevd, _s8, _z, )(pg, op); +} + +// CHECK-LABEL: @test_svrevd_s16_z( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv8i16( zeroinitializer, [[TMP0]], [[OP:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z17test_svrevd_s16_zu10__SVBool_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv8i16( zeroinitializer, [[TMP0]], [[OP:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svint16_t test_svrevd_s16_z(svbool_t pg, svint16_t op) { + return SVE_ACLE_FUNC(svrevd, _s16, _z, )(pg, op); +} + +// CHECK-LABEL: @test_svrevd_s32_z( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv4i32( zeroinitializer, [[TMP0]], [[OP:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z17test_svrevd_s32_zu10__SVBool_tu11__SVInt32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv4i32( zeroinitializer, [[TMP0]], [[OP:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svint32_t test_svrevd_s32_z(svbool_t pg, svint32_t op) { + return SVE_ACLE_FUNC(svrevd, _s32, _z, )(pg, op); +} + +// CHECK-LABEL: @test_svrevd_s64_z( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv2i64( zeroinitializer, [[TMP0]], [[OP:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z17test_svrevd_s64_zu10__SVBool_tu11__SVInt64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv2i64( zeroinitializer, [[TMP0]], [[OP:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svint64_t test_svrevd_s64_z(svbool_t pg, 
svint64_t op) { + return SVE_ACLE_FUNC(svrevd, _s64, _z, )(pg, op); +} + +// CHECK-LABEL: @test_svrevd_u8_z( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.revd.nxv16i8( zeroinitializer, [[PG:%.*]], [[OP:%.*]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z16test_svrevd_u8_zu10__SVBool_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.revd.nxv16i8( zeroinitializer, [[PG:%.*]], [[OP:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svuint8_t test_svrevd_u8_z(svbool_t pg, svuint8_t op) { + return SVE_ACLE_FUNC(svrevd, _u8, _z, )(pg, op); +} +// CHECK-LABEL: @test_svrevd_u16_z( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv8i16( zeroinitializer, [[TMP0]], [[OP:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z17test_svrevd_u16_zu10__SVBool_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv8i16( zeroinitializer, [[TMP0]], [[OP:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svuint16_t test_svrevd_u16_z(svbool_t pg, svuint16_t op) { + return SVE_ACLE_FUNC(svrevd, _u16, _z, )(pg, op); +} + +// CHECK-LABEL: @test_svrevd_u32_z( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv4i32( zeroinitializer, [[TMP0]], [[OP:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z17test_svrevd_u32_zu10__SVBool_tu12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv4i32( zeroinitializer, [[TMP0]], [[OP:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svuint32_t test_svrevd_u32_z(svbool_t pg, svuint32_t op) { + return SVE_ACLE_FUNC(svrevd, _u32, _z, )(pg, op); +} + +// CHECK-LABEL: @test_svrevd_u64_z( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv2i64( zeroinitializer, [[TMP0]], [[OP:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z17test_svrevd_u64_zu10__SVBool_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv2i64( zeroinitializer, [[TMP0]], [[OP:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svuint64_t test_svrevd_u64_z(svbool_t pg, svuint64_t op) { + return SVE_ACLE_FUNC(svrevd, _u64, _z, )(pg, op); +} + +// CHECK-LABEL: @test_svrevd_s8_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.revd.nxv16i8( [[INACTIVE:%.*]], [[PG:%.*]], [[OP:%.*]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z16test_svrevd_s8_mu10__SVInt8_tu10__SVBool_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.revd.nxv16i8( [[INACTIVE:%.*]], [[PG:%.*]], [[OP:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svint8_t test_svrevd_s8_m(svint8_t inactive, svbool_t pg, svint8_t op) { + return SVE_ACLE_FUNC(svrevd, 
_s8, _m, )(inactive, pg, op); +} + +// CHECK-LABEL: @test_svrevd_s16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv8i16( [[INACTIVE:%.*]], [[TMP0]], [[OP:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z17test_svrevd_s16_mu11__SVInt16_tu10__SVBool_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv8i16( [[INACTIVE:%.*]], [[TMP0]], [[OP:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svint16_t test_svrevd_s16_m(svint16_t inactive, svbool_t pg, svint16_t op) { + return SVE_ACLE_FUNC(svrevd, _s16, _m, )(inactive, pg, op); +} + +// CHECK-LABEL: @test_svrevd_s32_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv4i32( [[INACTIVE:%.*]], [[TMP0]], [[OP:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z17test_svrevd_s32_mu11__SVInt32_tu10__SVBool_tu11__SVInt32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv4i32( [[INACTIVE:%.*]], [[TMP0]], [[OP:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svint32_t test_svrevd_s32_m(svint32_t inactive, svbool_t pg, svint32_t op) { + return SVE_ACLE_FUNC(svrevd, _s32, _m, )(inactive, pg, op); +} + +// CHECK-LABEL: @test_svrevd_s64_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv2i64( [[INACTIVE:%.*]], [[TMP0]], [[OP:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z17test_svrevd_s64_mu11__SVInt64_tu10__SVBool_tu11__SVInt64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv2i64( [[INACTIVE:%.*]], [[TMP0]], [[OP:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svint64_t test_svrevd_s64_m(svint64_t inactive, svbool_t pg, svint64_t op) { + return SVE_ACLE_FUNC(svrevd, _s64, _m, )(inactive, pg, op); +} + +// CHECK-LABEL: @test_svrevd_u8_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.revd.nxv16i8( [[INACTIVE:%.*]], [[PG:%.*]], [[OP:%.*]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z16test_svrevd_u8_mu11__SVUint8_tu10__SVBool_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.revd.nxv16i8( [[INACTIVE:%.*]], [[PG:%.*]], [[OP:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svuint8_t test_svrevd_u8_m(svuint8_t inactive, svbool_t pg, svuint8_t op) { + return SVE_ACLE_FUNC(svrevd, _u8, _m, )(inactive, pg, op); +} + +// CHECK-LABEL: @test_svrevd_u16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv8i16( [[INACTIVE:%.*]], [[TMP0]], [[OP:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z17test_svrevd_u16_mu12__SVUint16_tu10__SVBool_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: 
[[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv8i16( [[INACTIVE:%.*]], [[TMP0]], [[OP:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svuint16_t test_svrevd_u16_m(svuint16_t inactive, svbool_t pg, svuint16_t op) { + return SVE_ACLE_FUNC(svrevd, _u16, _m, )(inactive, pg, op); +} + +// CHECK-LABEL: @test_svrevd_u32_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv4i32( [[INACTIVE:%.*]], [[TMP0]], [[OP:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z17test_svrevd_u32_mu12__SVUint32_tu10__SVBool_tu12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv4i32( [[INACTIVE:%.*]], [[TMP0]], [[OP:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svuint32_t test_svrevd_u32_m(svuint32_t inactive, svbool_t pg, svuint32_t op) { + return SVE_ACLE_FUNC(svrevd, _u32, _m, )(inactive, pg, op); +} + +// CHECK-LABEL: @test_svrevd_u64_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv2i64( [[INACTIVE:%.*]], [[TMP0]], [[OP:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z17test_svrevd_u64_mu12__SVUint64_tu10__SVBool_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv2i64( [[INACTIVE:%.*]], [[TMP0]], [[OP:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svuint64_t test_svrevd_u64_m(svuint64_t inactive, svbool_t pg, svuint64_t op) { + return SVE_ACLE_FUNC(svrevd, _u64, _m, )(inactive, pg, op); +} + +// CHECK-LABEL: @test_svrevd_s8_x( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.revd.nxv16i8( undef, [[PG:%.*]], [[OP:%.*]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z16test_svrevd_s8_xu10__SVBool_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.revd.nxv16i8( undef, [[PG:%.*]], [[OP:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svint8_t test_svrevd_s8_x(svbool_t pg, svint8_t op) { + return SVE_ACLE_FUNC(svrevd, _s8, _x, )(pg, op); +} + +// CHECK-LABEL: @test_svrevd_s16_x( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv8i16( undef, [[TMP0]], [[OP:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z17test_svrevd_s16_xu10__SVBool_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv8i16( undef, [[TMP0]], [[OP:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svint16_t test_svrevd_s16_x(svbool_t pg, svint16_t op) { + return SVE_ACLE_FUNC(svrevd, _s16, _x, )(pg, op); +} + +// CHECK-LABEL: @test_svrevd_s32_x( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call 
@llvm.aarch64.sve.revd.nxv4i32( undef, [[TMP0]], [[OP:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z17test_svrevd_s32_xu10__SVBool_tu11__SVInt32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv4i32( undef, [[TMP0]], [[OP:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svint32_t test_svrevd_s32_x(svbool_t pg, svint32_t op) { + return SVE_ACLE_FUNC(svrevd, _s32, _x, )(pg, op); +} + +// CHECK-LABEL: @test_svrevd_s64_x( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv2i64( undef, [[TMP0]], [[OP:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z17test_svrevd_s64_xu10__SVBool_tu11__SVInt64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv2i64( undef, [[TMP0]], [[OP:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svint64_t test_svrevd_s64_x(svbool_t pg, svint64_t op) { + return SVE_ACLE_FUNC(svrevd, _s64, _x, )(pg, op); +} + +// CHECK-LABEL: @test_svrevd_u8_x( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.revd.nxv16i8( undef, [[PG:%.*]], [[OP:%.*]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z16test_svrevd_u8_xu10__SVBool_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.revd.nxv16i8( undef, [[PG:%.*]], [[OP:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svuint8_t test_svrevd_u8_x(svbool_t pg, svuint8_t op) { + return SVE_ACLE_FUNC(svrevd, _u8, _x, )(pg, op); +} + +// CHECK-LABEL: @test_svrevd_u16_x( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv8i16( undef, [[TMP0]], [[OP:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z17test_svrevd_u16_xu10__SVBool_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv8i16( undef, [[TMP0]], [[OP:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svuint16_t test_svrevd_u16_x(svbool_t pg, svuint16_t op) { + return SVE_ACLE_FUNC(svrevd, _u16, _x, )(pg, op); +} + +// CHECK-LABEL: @test_svrevd_u32_x( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv4i32( undef, [[TMP0]], [[OP:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z17test_svrevd_u32_xu10__SVBool_tu12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv4i32( undef, [[TMP0]], [[OP:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svuint32_t test_svrevd_u32_x(svbool_t pg, svuint32_t op) { + return SVE_ACLE_FUNC(svrevd, _u32, _x, )(pg, op); +} + +// CHECK-LABEL: @test_svrevd_u64_x( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: 
[[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv2i64( undef, [[TMP0]], [[OP:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z17test_svrevd_u64_xu10__SVBool_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv2i64( undef, [[TMP0]], [[OP:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svuint64_t test_svrevd_u64_x(svbool_t pg, svuint64_t op) { + return SVE_ACLE_FUNC(svrevd, _u64, _x, )(pg, op); +} diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfadd.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfadd.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfadd.c @@ -0,0 +1,68 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -target-feature +sve -target-feature -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. 
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3) A1##A2##A3 +#endif + +// CHECK-LABEL: @test_svadd_bf16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fadd.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[OP2:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z17test_svadd_bf16_mu10__SVBool_tu14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fadd.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[OP2:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svbfloat16_t test_svadd_bf16_m(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2) +{ + return SVE_ACLE_FUNC(svadd, _bf16, _m)(pg, op1, op2); +} + +// CHECK-LABEL: @test_svadd_bf16_z( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[OP1:%.*]], zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.fadd.nxv8bf16( [[TMP0]], [[TMP1]], [[OP2:%.*]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z17test_svadd_bf16_zu10__SVBool_tu14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[OP1:%.*]], zeroinitializer +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.fadd.nxv8bf16( [[TMP0]], [[TMP1]], [[OP2:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svbfloat16_t test_svadd_bf16_z(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2) +{ + return SVE_ACLE_FUNC(svadd, _bf16, _z)(pg, op1, op2); +} + +// CHECK-LABEL: @test_svadd_bf16_x( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fadd.u.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[OP2:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z17test_svadd_bf16_xu10__SVBool_tu14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fadd.u.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[OP2:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svbfloat16_t test_svadd_bf16_x(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2) +{ + return SVE_ACLE_FUNC(svadd, _bf16, _x)(pg, op1, op2); +} diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmax.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmax.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmax.c @@ -0,0 +1,68 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p 
mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -target-feature +sve -target-feature -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3) A1##A2##A3 +#endif + +// CHECK-LABEL: @test_svmax_bf16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fmax.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[OP2:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z17test_svmax_bf16_mu10__SVBool_tu14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fmax.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[OP2:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svbfloat16_t test_svmax_bf16_m(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2) +{ + return SVE_ACLE_FUNC(svmax, _bf16, _m)(pg, op1, op2); +} + +// CHECK-LABEL: @test_svmax_bf16_z( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[OP1:%.*]], zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.fmax.nxv8bf16( [[TMP0]], [[TMP1]], [[OP2:%.*]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z17test_svmax_bf16_zu10__SVBool_tu14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[OP1:%.*]], zeroinitializer +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.fmax.nxv8bf16( [[TMP0]], [[TMP1]], [[OP2:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svbfloat16_t test_svmax_bf16_z(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2) +{ + return SVE_ACLE_FUNC(svmax, _bf16, _z)(pg, op1, op2); +} + +// CHECK-LABEL: @test_svmax_bf16_x( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fmax.u.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[OP2:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z17test_svmax_bf16_xu10__SVBool_tu14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fmax.u.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[OP2:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svbfloat16_t test_svmax_bf16_x(svbool_t pg, 
svbfloat16_t op1, svbfloat16_t op2) +{ + return SVE_ACLE_FUNC(svmax, _bf16, _x)(pg, op1, op2); +} diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmaxnm.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmaxnm.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmaxnm.c @@ -0,0 +1,68 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -target-feature +sve -target-feature -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. 
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3) A1##A2##A3 +#endif + +// CHECK-LABEL: @test_svmaxnm_bf16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fmaxnm.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[OP2:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z19test_svmaxnm_bf16_mu10__SVBool_tu14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fmaxnm.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[OP2:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svbfloat16_t test_svmaxnm_bf16_m(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2) +{ + return SVE_ACLE_FUNC(svmaxnm, _bf16, _m)(pg, op1, op2); +} + +// CHECK-LABEL: @test_svmaxnm_bf16_z( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[OP1:%.*]], zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.fmaxnm.nxv8bf16( [[TMP0]], [[TMP1]], [[OP2:%.*]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z19test_svmaxnm_bf16_zu10__SVBool_tu14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[OP1:%.*]], zeroinitializer +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.fmaxnm.nxv8bf16( [[TMP0]], [[TMP1]], [[OP2:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svbfloat16_t test_svmaxnm_bf16_z(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2) +{ + return SVE_ACLE_FUNC(svmaxnm, _bf16, _z)(pg, op1, op2); +} + +// CHECK-LABEL: @test_svmaxnm_bf16_x( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fmaxnm.u.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[OP2:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z19test_svmaxnm_bf16_xu10__SVBool_tu14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fmaxnm.u.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[OP2:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svbfloat16_t test_svmaxnm_bf16_x(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2) +{ + return SVE_ACLE_FUNC(svmaxnm, _bf16, _x)(pg, op1, op2); +} diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmin.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmin.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmin.c @@ -0,0 +1,68 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - 
-x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -target-feature +sve -target-feature -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3) A1##A2##A3 +#endif + +// CHECK-LABEL: @test_svmin_bf16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fmin.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[OP2:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z17test_svmin_bf16_mu10__SVBool_tu14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fmin.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[OP2:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svbfloat16_t test_svmin_bf16_m(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2) +{ + return SVE_ACLE_FUNC(svmin, _bf16, _m)(pg, op1, op2); +} + +// CHECK-LABEL: @test_svmin_bf16_z( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[OP1:%.*]], zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.fmin.nxv8bf16( [[TMP0]], [[TMP1]], [[OP2:%.*]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z17test_svmin_bf16_zu10__SVBool_tu14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[OP1:%.*]], zeroinitializer +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.fmin.nxv8bf16( [[TMP0]], [[TMP1]], [[OP2:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svbfloat16_t test_svmin_bf16_z(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2) +{ + return SVE_ACLE_FUNC(svmin, _bf16, _z)(pg, op1, op2); +} + +// CHECK-LABEL: @test_svmin_bf16_x( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fmin.u.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[OP2:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z17test_svmin_bf16_xu10__SVBool_tu14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fmin.u.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[OP2:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svbfloat16_t 
test_svmin_bf16_x(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2) +{ + return SVE_ACLE_FUNC(svmin, _bf16, _x)(pg, op1, op2); +} diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfminnm.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfminnm.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfminnm.c @@ -0,0 +1,68 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -target-feature +sve -target-feature -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. 
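+// For example, with SVE_OVERLOADED_FORMS defined, SVE_ACLE_FUNC(svminnm, _bf16, _m)
+// pastes A1##A3 and calls the overloaded form svminnm_m; otherwise A1##A2##A3 calls
+// the fully-suffixed form svminnm_bf16_m.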
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3) A1##A2##A3 +#endif + +// CHECK-LABEL: @test_svminnm_bf16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fminnm.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[OP2:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z19test_svminnm_bf16_mu10__SVBool_tu14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fminnm.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[OP2:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svbfloat16_t test_svminnm_bf16_m(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2) +{ + return SVE_ACLE_FUNC(svminnm, _bf16, _m)(pg, op1, op2); +} + +// CHECK-LABEL: @test_svminnm_bf16_z( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[OP1:%.*]], zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.fminnm.nxv8bf16( [[TMP0]], [[TMP1]], [[OP2:%.*]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z19test_svminnm_bf16_zu10__SVBool_tu14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[OP1:%.*]], zeroinitializer +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.fminnm.nxv8bf16( [[TMP0]], [[TMP1]], [[OP2:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svbfloat16_t test_svminnm_bf16_z(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2) +{ + return SVE_ACLE_FUNC(svminnm, _bf16, _z)(pg, op1, op2); +} + +// CHECK-LABEL: @test_svminnm_bf16_x( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fminnm.u.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[OP2:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z19test_svminnm_bf16_xu10__SVBool_tu14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fminnm.u.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[OP2:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svbfloat16_t test_svminnm_bf16_x(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2) +{ + return SVE_ACLE_FUNC(svminnm, _bf16, _x)(pg, op1, op2); +} diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmlsl.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmlsl.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmlsl.c @@ -0,0 +1,85 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s 
-check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 +#endif + +// BFMLSLB + +__attribute__((arm_streaming_compatible)) +// CHECK-LABEL: @test_bfmlslb( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.bfmlslb( [[ZDA:%.*]], [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z12test_bfmlslbu13__SVFloat32_tu14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.bfmlslb( [[ZDA:%.*]], [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svfloat32_t test_bfmlslb(svfloat32_t zda, svbfloat16_t zn, svbfloat16_t zm) +{ + return SVE_ACLE_FUNC(svbfmlslb,_f32,,)(zda, zn, zm); +} + +__attribute__((arm_streaming_compatible)) +// CHECK-LABEL: @test_bfmlslb_idx( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.bfmlslb.lane( [[ZDA:%.*]], [[ZN:%.*]], [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z16test_bfmlslb_idxu13__SVFloat32_tu14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.bfmlslb.lane( [[ZDA:%.*]], [[ZN:%.*]], [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svfloat32_t test_bfmlslb_idx(svfloat32_t zda, svbfloat16_t zn, svbfloat16_t zm) +{ + return SVE_ACLE_FUNC(svbfmlslb_lane,_f32,,)(zda, zn, zm, 7); +} + +// BFMLSLT + +__attribute__((arm_streaming_compatible)) +// CHECK-LABEL: @test_bfmlslt( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.bfmlslt( [[ZDA:%.*]], [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z12test_bfmlsltu13__SVFloat32_tu14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.bfmlslt( [[ZDA:%.*]], [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svfloat32_t test_bfmlslt(svfloat32_t zda, svbfloat16_t zn, svbfloat16_t zm) +{ + return SVE_ACLE_FUNC(svbfmlslt,_f32,,)(zda, zn, zm); +} + +__attribute__((arm_streaming_compatible)) +// CHECK-LABEL: @test_bfmlslt_idx( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.bfmlslt.lane( [[ZDA:%.*]], [[ZN:%.*]], [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z16test_bfmlslt_idxu13__SVFloat32_tu14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.bfmlslt.lane( [[ZDA:%.*]], [[ZN:%.*]], [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svfloat32_t test_bfmlslt_idx(svfloat32_t zda, svbfloat16_t zn, svbfloat16_t zm) +{ + return SVE_ACLE_FUNC(svbfmlslt_lane,_f32,,)(zda, zn, zm, 7); +} diff --git 
a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmul.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmul.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmul.c @@ -0,0 +1,68 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -target-feature +sve -target-feature -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3) A1##A2##A3 +#endif + +// CHECK-LABEL: @test_svmul_bf16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fmul.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[OP2:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z17test_svmul_bf16_mu10__SVBool_tu14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fmul.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[OP2:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svbfloat16_t test_svmul_bf16_m(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2) +{ + return SVE_ACLE_FUNC(svmul, _bf16, _m)(pg, op1, op2); +} + +// CHECK-LABEL: @test_svmul_bf16_z( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[OP1:%.*]], zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.fmul.nxv8bf16( [[TMP0]], [[TMP1]], [[OP2:%.*]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z17test_svmul_bf16_zu10__SVBool_tu14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[OP1:%.*]], zeroinitializer +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.fmul.nxv8bf16( [[TMP0]], [[TMP1]], [[OP2:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svbfloat16_t test_svmul_bf16_z(svbool_t pg, svbfloat16_t op1, 
svbfloat16_t op2) +{ + return SVE_ACLE_FUNC(svmul, _bf16, _z)(pg, op1, op2); +} + +// CHECK-LABEL: @test_svmul_bf16_x( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fmul.u.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[OP2:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z17test_svmul_bf16_xu10__SVBool_tu14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fmul.u.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[OP2:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svbfloat16_t test_svmul_bf16_x(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2) +{ + return SVE_ACLE_FUNC(svmul, _bf16, _x)(pg, op1, op2); +} diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmul_lane.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmul_lane.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmul_lane.c @@ -0,0 +1,61 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -target-feature +sve -target-feature -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. 
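+// For example, SVE_ACLE_FUNC(svmul_lane, _bf16, ) resolves to the overloaded
+// svmul_lane when SVE_OVERLOADED_FORMS is defined, and to svmul_lane_bf16 otherwise.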
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3) A1##A2##A3 +#endif + +// CHECK-LABEL: @test_svmul_lane_bf16_idx1( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.fmul.lane.nxv8bf16( [[OP1:%.*]], [[OP2:%.*]], i32 1) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z25test_svmul_lane_bf16_idx1u14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.fmul.lane.nxv8bf16( [[OP1:%.*]], [[OP2:%.*]], i32 1) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svbfloat16_t test_svmul_lane_bf16_idx1(svbfloat16_t op1, svbfloat16_t op2) +{ + return SVE_ACLE_FUNC(svmul_lane, _bf16, )(op1, op2, 1); +} + +// CHECK-LABEL: @test_svmul_lane_bf16_idx3( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.fmul.lane.nxv8bf16( [[OP1:%.*]], [[OP2:%.*]], i32 3) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z25test_svmul_lane_bf16_idx3u14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.fmul.lane.nxv8bf16( [[OP1:%.*]], [[OP2:%.*]], i32 3) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svbfloat16_t test_svmul_lane_bf16_idx3(svbfloat16_t op1, svbfloat16_t op2) +{ + return SVE_ACLE_FUNC(svmul_lane, _bf16, )(op1, op2, 3); +} + +// CHECK-LABEL: @test_svmul_lane_bf16_idx7( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.fmul.lane.nxv8bf16( [[OP1:%.*]], [[OP2:%.*]], i32 7) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z25test_svmul_lane_bf16_idx7u14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.fmul.lane.nxv8bf16( [[OP1:%.*]], [[OP2:%.*]], i32 7) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svbfloat16_t test_svmul_lane_bf16_idx7(svbfloat16_t op1, svbfloat16_t op2) +{ + return SVE_ACLE_FUNC(svmul_lane, _bf16, )(op1, op2, 7); +} + diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfsub.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfsub.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfsub.c @@ -0,0 +1,68 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -target-feature +sve -target-feature -S 
-disable-O0-optnone -Werror -Wall -o /dev/null %s +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3) A1##A2##A3 +#endif + +// CHECK-LABEL: @test_svsub_bf16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fsub.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[OP2:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z17test_svsub_bf16_mu10__SVBool_tu14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fsub.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[OP2:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svbfloat16_t test_svsub_bf16_m(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2) +{ + return SVE_ACLE_FUNC(svsub, _bf16, _m)(pg, op1, op2); +} + +// CHECK-LABEL: @test_svsub_bf16_z( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[OP1:%.*]], zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.fsub.nxv8bf16( [[TMP0]], [[TMP1]], [[OP2:%.*]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z17test_svsub_bf16_zu10__SVBool_tu14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[OP1:%.*]], zeroinitializer +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.fsub.nxv8bf16( [[TMP0]], [[TMP1]], [[OP2:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svbfloat16_t test_svsub_bf16_z(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2) +{ + return SVE_ACLE_FUNC(svsub, _bf16, _z)(pg, op1, op2); +} + +// CHECK-LABEL: @test_svsub_bf16_x( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fsub.u.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[OP2:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z17test_svsub_bf16_xu10__SVBool_tu14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fsub.u.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[OP2:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svbfloat16_t test_svsub_bf16_x(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2) +{ + return SVE_ACLE_FUNC(svsub, _bf16, _x)(pg, op1, op2); +} diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_cntp.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_cntp.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_cntp.c @@ -0,0 +1,127 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// 
RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + +#include + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svcntp_c8_vlx2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sve.cntp.c8(target("aarch64.svcount") [[PNN:%.*]], i32 2) +// CHECK-NEXT: ret i64 [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z19test_svcntp_c8_vlx2u11__SVCount_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sve.cntp.c8(target("aarch64.svcount") [[PNN:%.*]], i32 2) +// CPP-CHECK-NEXT: ret i64 [[TMP0]] +// +uint64_t test_svcntp_c8_vlx2(svcount_t pnn) { + return svcntp_c8(pnn, 2); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svcntp_c8_vlx4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sve.cntp.c8(target("aarch64.svcount") [[PNN:%.*]], i32 4) +// CHECK-NEXT: ret i64 [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z19test_svcntp_c8_vlx4u11__SVCount_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sve.cntp.c8(target("aarch64.svcount") [[PNN:%.*]], i32 4) +// CPP-CHECK-NEXT: ret i64 [[TMP0]] +// +uint64_t test_svcntp_c8_vlx4(svcount_t pnn) { + return svcntp_c8(pnn, 4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svcntp_c16_vlx2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sve.cntp.c16(target("aarch64.svcount") [[PNN:%.*]], i32 2) +// CHECK-NEXT: ret i64 [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z20test_svcntp_c16_vlx2u11__SVCount_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sve.cntp.c16(target("aarch64.svcount") [[PNN:%.*]], i32 2) +// CPP-CHECK-NEXT: ret i64 [[TMP0]] +// +uint64_t test_svcntp_c16_vlx2(svcount_t pnn) { + return svcntp_c16(pnn, 2); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svcntp_c16_vlx4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sve.cntp.c16(target("aarch64.svcount") [[PNN:%.*]], i32 4) +// CHECK-NEXT: ret i64 [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z20test_svcntp_c16_vlx4u11__SVCount_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sve.cntp.c16(target("aarch64.svcount") [[PNN:%.*]], i32 4) +// CPP-CHECK-NEXT: ret i64 [[TMP0]] +// +uint64_t test_svcntp_c16_vlx4(svcount_t pnn) { + return svcntp_c16(pnn, 4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svcntp_c32_vlx2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sve.cntp.c32(target("aarch64.svcount") [[PNN:%.*]], i32 2) +// CHECK-NEXT: ret i64 [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z20test_svcntp_c32_vlx2u11__SVCount_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sve.cntp.c32(target("aarch64.svcount") [[PNN:%.*]], i32 2) +// CPP-CHECK-NEXT: ret i64 [[TMP0]] +// +uint64_t test_svcntp_c32_vlx2(svcount_t pnn) { + return svcntp_c32(pnn, 2); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svcntp_c32_vlx4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sve.cntp.c32(target("aarch64.svcount") [[PNN:%.*]], i32 4) +// CHECK-NEXT: ret i64 [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z20test_svcntp_c32_vlx4u11__SVCount_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sve.cntp.c32(target("aarch64.svcount") [[PNN:%.*]], i32 4) +// CPP-CHECK-NEXT: ret i64 [[TMP0]] +// 
+uint64_t test_svcntp_c32_vlx4(svcount_t pnn) { + return svcntp_c32(pnn, 4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svcntp_c64_vlx2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sve.cntp.c64(target("aarch64.svcount") [[PNN:%.*]], i32 2) +// CHECK-NEXT: ret i64 [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z20test_svcntp_c64_vlx2u11__SVCount_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sve.cntp.c64(target("aarch64.svcount") [[PNN:%.*]], i32 2) +// CPP-CHECK-NEXT: ret i64 [[TMP0]] +// +uint64_t test_svcntp_c64_vlx2(svcount_t pnn) { + return svcntp_c64(pnn, 2); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svcntp_c64_vlx4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sve.cntp.c64(target("aarch64.svcount") [[PNN:%.*]], i32 4) +// CHECK-NEXT: ret i64 [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z20test_svcntp_c64_vlx4u11__SVCount_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sve.cntp.c64(target("aarch64.svcount") [[PNN:%.*]], i32 4) +// CPP-CHECK-NEXT: ret i64 [[TMP0]] +// +uint64_t test_svcntp_c64_vlx4(svcount_t pnn) { + return svcntp_c64(pnn, 4); +} diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_dot.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_dot.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_dot.c @@ -0,0 +1,107 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. 
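+// For example, SVE_ACLE_FUNC(svdot,_s32_s16_s16,) resolves to the overloaded svdot
+// when SVE_OVERLOADED_FORMS is defined, and to svdot_s32_s16_s16 otherwise.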
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3) A1##A2##A3 +#endif + +// CHECK-LABEL: @test_svdot_s32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.sdot.x2.nxv4i32( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z17test_svdot_s32_x2u11__SVInt32_tu11__SVInt16_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.sdot.x2.nxv4i32( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svint32_t test_svdot_s32_x2(svint32_t op1, svint16_t op2, svint16_t op3) +{ + return SVE_ACLE_FUNC(svdot,_s32_s16_s16,)(op1, op2, op3); +} + +// CHECK-LABEL: @test_svdot_u32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.udot.x2.nxv4i32( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z17test_svdot_u32_x2u12__SVUint32_tu12__SVUint16_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.udot.x2.nxv4i32( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svuint32_t test_svdot_u32_x2(svuint32_t op1, svuint16_t op2, svuint16_t op3) +{ + return SVE_ACLE_FUNC(svdot,_u32_u16_u16,)(op1, op2, op3); +} + +// CHECK-LABEL: @test_svdot_f32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.fdot.x2.nxv4f32( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z17test_svdot_f32_x2u13__SVFloat32_tu13__SVFloat16_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.fdot.x2.nxv4f32( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svfloat32_t test_svdot_f32_x2(svfloat32_t op1, svfloat16_t op2, svfloat16_t op3) +{ + return SVE_ACLE_FUNC(svdot,_f32_f16_f16,)(op1, op2, op3); +} + + + +// CHECK-LABEL: @test_svdot_lane_s32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.sdot.lane.x2.nxv4i32( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]], i32 3) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svdot_lane_s32_x2u11__SVInt32_tu11__SVInt16_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.sdot.lane.x2.nxv4i32( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]], i32 3) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svint32_t test_svdot_lane_s32_x2(svint32_t op1, svint16_t op2, svint16_t op3) +{ + return SVE_ACLE_FUNC(svdot_lane,_s32_s16_s16,)(op1, op2, op3, 3); +} + +// CHECK-LABEL: @test_svdot_lane_u32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.udot.lane.x2.nxv4i32( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]], i32 3) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svdot_lane_u32_x2u12__SVUint32_tu12__SVUint16_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.udot.lane.x2.nxv4i32( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]], i32 3) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svuint32_t test_svdot_lane_u32_x2(svuint32_t op1, svuint16_t op2, svuint16_t op3) +{ + return SVE_ACLE_FUNC(svdot_lane,_u32_u16_u16,)(op1, op2, op3, 3); +} + +// CHECK-LABEL: @test_svdot_lane_f32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.fdot.lane.x2.nxv4f32( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]], i32 3) +// CHECK-NEXT: ret [[TMP0]] 
+// +// CPP-CHECK-LABEL: @_Z22test_svdot_lane_f32_x2u13__SVFloat32_tu13__SVFloat16_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.fdot.lane.x2.nxv4f32( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]], i32 3) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svfloat32_t test_svdot_lane_f32_x2(svfloat32_t op1, svfloat16_t op2, svfloat16_t op3) +{ + return SVE_ACLE_FUNC(svdot_lane,_f32_f16_f16,)(op1, op2, op3, 3); +} diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_fclamp.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_fclamp.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_fclamp.c @@ -0,0 +1,265 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 \ +// RUN: -S -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 \ +// RUN: -S -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 \ +// RUN: -S -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 \ +// RUN: -S -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 \ +// RUN: -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. 
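+// This file uses the four-argument variant of the macro: for example,
+// SVE_ACLE_FUNC(svclamp, _f16, , ) resolves to the overloaded svclamp when
+// SVE_OVERLOADED_FORMS is defined, and to svclamp_f16 otherwise.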
+#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED) A1##A3 +#else +#define SVE_ACLE_FUNC(A1, A2, A3, A4) A1##A2##A3##A4 +#endif + +// CHECK-LABEL: @test_svclamp_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.fclamp.nxv8f16( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z16test_svclamp_f16u13__SVFloat16_tu13__SVFloat16_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.fclamp.nxv8f16( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svfloat16_t test_svclamp_f16(svfloat16_t op1, svfloat16_t op2, svfloat16_t op3) { + return SVE_ACLE_FUNC(svclamp, _f16, , )(op1, op2, op3); +} + +// CHECK-LABEL: @test_svclamp_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.fclamp.nxv4f32( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z16test_svclamp_f32u13__SVFloat32_tu13__SVFloat32_tu13__SVFloat32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.fclamp.nxv4f32( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svfloat32_t test_svclamp_f32(svfloat32_t op1, svfloat32_t op2, svfloat32_t op3) { + return SVE_ACLE_FUNC(svclamp, _f32, , )(op1, op2, op3); +} + +// CHECK-LABEL: @test_svclamp_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.fclamp.nxv2f64( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z16test_svclamp_f64u13__SVFloat64_tu13__SVFloat64_tu13__SVFloat64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.fclamp.nxv2f64( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svfloat64_t test_svclamp_f64(svfloat64_t op1, svfloat64_t op2, svfloat64_t op3) { + return SVE_ACLE_FUNC(svclamp, _f64, , )(op1, op2, op3); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svclamp_single_f16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[OP1:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[OP1]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.fclamp.single.x2.nxv8f16( [[TMP0]], [[TMP1]], [[OP2:%.*]], [[OP3:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z26test_svclamp_single_f16_x213svfloat16x2_tu13__SVFloat16_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[OP1:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[OP1]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.fclamp.single.x2.nxv8f16( [[TMP0]], [[TMP1]], [[OP2:%.*]], [[OP3:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: 
[[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svfloat16x2_t test_svclamp_single_f16_x2(svfloat16x2_t op1, svfloat16_t op2, svfloat16_t op3) { + return SVE_ACLE_FUNC(svclamp, _single_f16_x2, , )(op1, op2, op3); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svclamp_single_f32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[OP1:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[OP1]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.fclamp.single.x2.nxv4f32( [[TMP0]], [[TMP1]], [[OP2:%.*]], [[OP3:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z26test_svclamp_single_f32_x213svfloat32x2_tu13__SVFloat32_tu13__SVFloat32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[OP1:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[OP1]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.fclamp.single.x2.nxv4f32( [[TMP0]], [[TMP1]], [[OP2:%.*]], [[OP3:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svfloat32x2_t test_svclamp_single_f32_x2(svfloat32x2_t op1, svfloat32_t op2, svfloat32_t op3) { + return SVE_ACLE_FUNC(svclamp, _single_f32_x2, , )(op1, op2, op3); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svclamp_single_f64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[OP1:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[OP1]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.fclamp.single.x2.nxv2f64( [[TMP0]], [[TMP1]], [[OP2:%.*]], [[OP3:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP4]], [[TMP5]], i64 2) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z26test_svclamp_single_f64_x213svfloat64x2_tu13__SVFloat64_tu13__SVFloat64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[OP1:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[OP1]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.fclamp.single.x2.nxv2f64( [[TMP0]], [[TMP1]], [[OP2:%.*]], [[OP3:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP3]], 
i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP4]], [[TMP5]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svfloat64x2_t test_svclamp_single_f64_x2(svfloat64x2_t op1, svfloat64_t op2, svfloat64_t op3) { + return SVE_ACLE_FUNC(svclamp, _single_f64_x2, , )(op1, op2, op3); +} + + + + + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svclamp_single_f16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[OP1:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[OP1]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[OP1]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[OP1]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.fclamp.single.x4.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[OP2:%.*]], [[OP3:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 8) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP8]], [[TMP9]], i64 16) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP10]], [[TMP11]], i64 24) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z26test_svclamp_single_f16_x413svfloat16x4_tu13__SVFloat16_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[OP1:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[OP1]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[OP1]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[OP1]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.fclamp.single.x4.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[OP2:%.*]], [[OP3:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 8) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP8]], [[TMP9]], i64 16) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP10]], [[TMP11]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svfloat16x4_t test_svclamp_single_f16_x4(svfloat16x4_t op1, svfloat16_t op2, svfloat16_t op3) { + return SVE_ACLE_FUNC(svclamp, _single_f16_x4, , )(op1, op2, op3); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svclamp_single_f32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call 
@llvm.vector.extract.nxv4f32.nxv16f32( [[OP1:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[OP1]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[OP1]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[OP1]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.fclamp.single.x4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[OP2:%.*]], [[OP3:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 4) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP8]], [[TMP9]], i64 8) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP10]], [[TMP11]], i64 12) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z26test_svclamp_single_f32_x413svfloat32x4_tu13__SVFloat32_tu13__SVFloat32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[OP1:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[OP1]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[OP1]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[OP1]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.fclamp.single.x4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[OP2:%.*]], [[OP3:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 4) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP8]], [[TMP9]], i64 8) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP10]], [[TMP11]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svfloat32x4_t test_svclamp_single_f32_x4(svfloat32x4_t op1, svfloat32_t op2, svfloat32_t op3) { + return SVE_ACLE_FUNC(svclamp, _single_f32_x4, , )(op1, op2, op3); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svclamp_single_f64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[OP1:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[OP1]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[OP1]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[OP1]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.fclamp.single.x4.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[OP2:%.*]], [[OP3:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue 
{ , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 2) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP8]], [[TMP9]], i64 4) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP10]], [[TMP11]], i64 6) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z26test_svclamp_single_f64_x413svfloat64x4_tu13__SVFloat64_tu13__SVFloat64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[OP1:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[OP1]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[OP1]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[OP1]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.fclamp.single.x4.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[OP2:%.*]], [[OP3:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 2) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP8]], [[TMP9]], i64 4) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP10]], [[TMP11]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svfloat64x4_t test_svclamp_single_f64_x4(svfloat64x4_t op1, svfloat64_t op2, svfloat64_t op3) { + return SVE_ACLE_FUNC(svclamp, _single_f64_x4, , )(op1, op2, op3); +} diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_ld1.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_ld1.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_ld1.c @@ -0,0 +1,1294 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. 
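+// For example, SVE_ACLE_FUNC(svld1,_u8,_x2,) resolves to the overloaded svld1_x2
+// when SVE_OVERLOADED_FORMS is defined, and to svld1_u8_x2 otherwise.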
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 +#endif + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svld1_u8_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ld1.pn.vg2.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z16test_svld1_u8_x2u11__SVCount_tPKh( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ld1.pn.vg2.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint8x2_t test_svld1_u8_x2(svcount_t pn, const uint8_t *base) +{ + return SVE_ACLE_FUNC(svld1,_u8,_x2,)(pn, base); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svld1_u16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ld1.pn.vg2.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z17test_svld1_u16_x2u11__SVCount_tPKt( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ld1.pn.vg2.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint16x2_t test_svld1_u16_x2(svcount_t pn, const uint16_t *base) +{ + return SVE_ACLE_FUNC(svld1,_u16,_x2,)(pn, base); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svld1_u32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ld1.pn.vg2.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z17test_svld1_u32_x2u11__SVCount_tPKj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } 
@llvm.aarch64.sve.ld1.pn.vg2.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint32x2_t test_svld1_u32_x2(svcount_t pn, const uint32_t *base) +{ + return SVE_ACLE_FUNC(svld1,_u32,_x2,)(pn, base); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svld1_u64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ld1.pn.vg2.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z17test_svld1_u64_x2u11__SVCount_tPKm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ld1.pn.vg2.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint64x2_t test_svld1_u64_x2(svcount_t pn, const uint64_t *base) +{ + return SVE_ACLE_FUNC(svld1,_u64,_x2,)(pn, base); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svld1_u8_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld1.pn.vg4.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z16test_svld1_u8_x4u11__SVCount_tPKh( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld1.pn.vg4.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } 
[[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint8x4_t test_svld1_u8_x4(svcount_t pn, const uint8_t *base) +{ + return SVE_ACLE_FUNC(svld1,_u8,_x4,)(pn, base); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svld1_u16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld1.pn.vg4.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svld1_u16_x4u11__SVCount_tPKt( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld1.pn.vg4.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint16x4_t test_svld1_u16_x4(svcount_t pn, const uint16_t *base) +{ + return SVE_ACLE_FUNC(svld1,_u16,_x4,)(pn, base); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svld1_u32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld1.pn.vg4.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: 
@_Z17test_svld1_u32_x4u11__SVCount_tPKj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld1.pn.vg4.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint32x4_t test_svld1_u32_x4(svcount_t pn, const uint32_t *base) +{ + return SVE_ACLE_FUNC(svld1,_u32,_x4,)(pn, base); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svld1_u64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld1.pn.vg4.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svld1_u64_x4u11__SVCount_tPKm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld1.pn.vg4.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint64x4_t test_svld1_u64_x4(svcount_t pn, const uint64_t *base) +{ + return SVE_ACLE_FUNC(svld1,_u64,_x4,)(pn, base); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svld1_s8_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ld1.pn.vg2.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], 
i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z16test_svld1_s8_x2u11__SVCount_tPKa( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ld1.pn.vg2.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svint8x2_t test_svld1_s8_x2(svcount_t pn, const int8_t *base) +{ + return SVE_ACLE_FUNC(svld1,_s8,_x2,)(pn, base); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svld1_s16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ld1.pn.vg2.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z17test_svld1_s16_x2u11__SVCount_tPKs( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ld1.pn.vg2.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svint16x2_t test_svld1_s16_x2(svcount_t pn, const int16_t *base) +{ + return SVE_ACLE_FUNC(svld1,_s16,_x2,)(pn, base); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svld1_s32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ld1.pn.vg2.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z17test_svld1_s32_x2u11__SVCount_tPKi( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ld1.pn.vg2.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svint32x2_t test_svld1_s32_x2(svcount_t pn, 
const int32_t *base) +{ + return SVE_ACLE_FUNC(svld1,_s32,_x2,)(pn, base); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svld1_s64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ld1.pn.vg2.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z17test_svld1_s64_x2u11__SVCount_tPKl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ld1.pn.vg2.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svint64x2_t test_svld1_s64_x2(svcount_t pn, const int64_t *base) +{ + return SVE_ACLE_FUNC(svld1,_s64,_x2,)(pn, base); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svld1_s8_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld1.pn.vg4.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z16test_svld1_s8_x4u11__SVCount_tPKa( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld1.pn.vg4.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint8x4_t test_svld1_s8_x4(svcount_t pn, const int8_t *base) +{ + return SVE_ACLE_FUNC(svld1,_s8,_x4,)(pn, base); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svld1_s16_x4( 
+// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld1.pn.vg4.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svld1_s16_x4u11__SVCount_tPKs( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld1.pn.vg4.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint16x4_t test_svld1_s16_x4(svcount_t pn, const int16_t *base) +{ + return SVE_ACLE_FUNC(svld1,_s16,_x4,)(pn, base); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svld1_s32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld1.pn.vg4.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svld1_s32_x4u11__SVCount_tPKi( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld1.pn.vg4.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 
4) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint32x4_t test_svld1_s32_x4(svcount_t pn, const int32_t *base) +{ + return SVE_ACLE_FUNC(svld1,_s32,_x4,)(pn, base); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svld1_s64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld1.pn.vg4.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svld1_s64_x4u11__SVCount_tPKl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld1.pn.vg4.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint64x4_t test_svld1_s64_x4(svcount_t pn, const int64_t *base) +{ + return SVE_ACLE_FUNC(svld1,_s64,_x4,)(pn, base); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svld1_f16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ld1.pn.vg2.nxv8f16(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z17test_svld1_f16_x2u11__SVCount_tPKDh( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ld1.pn.vg2.nxv8f16(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call 
@llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svfloat16x2_t test_svld1_f16_x2(svcount_t pn, const float16_t *base) +{ + return SVE_ACLE_FUNC(svld1,_f16,_x2,)(pn, base); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svld1_f32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ld1.pn.vg2.nxv4f32(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z17test_svld1_f32_x2u11__SVCount_tPKf( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ld1.pn.vg2.nxv4f32(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svfloat32x2_t test_svld1_f32_x2(svcount_t pn, const float32_t *base) +{ + return SVE_ACLE_FUNC(svld1,_f32,_x2,)(pn, base); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svld1_f64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ld1.pn.vg2.nxv2f64(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z17test_svld1_f64_x2u11__SVCount_tPKd( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ld1.pn.vg2.nxv2f64(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svfloat64x2_t test_svld1_f64_x2(svcount_t pn, const float64_t *base) +{ + return SVE_ACLE_FUNC(svld1,_f64,_x2,)(pn, base); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svld1_f16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld1.pn.vg4.nxv8f16(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } 
[[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP4]], [[TMP5]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 24) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svld1_f16_x4u11__SVCount_tPKDh( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld1.pn.vg4.nxv8f16(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP4]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat16x4_t test_svld1_f16_x4(svcount_t pn, const float16_t *base) +{ + return SVE_ACLE_FUNC(svld1,_f16,_x4,)(pn, base); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svld1_f32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld1.pn.vg4.nxv4f32(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svld1_f32_x4u11__SVCount_tPKf( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld1.pn.vg4.nxv4f32(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) +// CPP-CHECK-NEXT: 
ret [[TMP8]] +// +svfloat32x4_t test_svld1_f32_x4(svcount_t pn, const float32_t *base) +{ + return SVE_ACLE_FUNC(svld1,_f32,_x4,)(pn, base); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svld1_f64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld1.pn.vg4.nxv2f64(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 6) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svld1_f64_x4u11__SVCount_tPKd( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld1.pn.vg4.nxv2f64(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat64x4_t test_svld1_f64_x4(svcount_t pn, const float64_t *base) +{ + return SVE_ACLE_FUNC(svld1,_f64,_x4,)(pn, base); +} + + +// == VNUM variants == + + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svld1_vnum_u8_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ld1.pn.vg2.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP3]], [[TMP4]], i64 16) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z21test_svld1_vnum_u8_x2u11__SVCount_tPKhl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ld1.pn.vg2.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call 
@llvm.vector.insert.nxv32i8.nxv16i8( [[TMP3]], [[TMP4]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svuint8x2_t test_svld1_vnum_u8_x2(svcount_t pn, const uint8_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svld1_vnum,_u8,_x2,)(pn, base, vnum); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svld1_vnum_u16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ld1.pn.vg2.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP3]], [[TMP4]], i64 8) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z22test_svld1_vnum_u16_x2u11__SVCount_tPKtl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ld1.pn.vg2.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP3]], [[TMP4]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svuint16x2_t test_svld1_vnum_u16_x2(svcount_t pn, const uint16_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svld1_vnum,_u16,_x2,)(pn, base, vnum); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svld1_vnum_u32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ld1.pn.vg2.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP3]], [[TMP4]], i64 4) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z22test_svld1_vnum_u32_x2u11__SVCount_tPKjl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ld1.pn.vg2.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP3]], [[TMP4]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svuint32x2_t test_svld1_vnum_u32_x2(svcount_t pn, const uint32_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svld1_vnum,_u32,_x2,)(pn, base, vnum); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svld1_vnum_u64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = 
tail call { , } @llvm.aarch64.sve.ld1.pn.vg2.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP3]], [[TMP4]], i64 2) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z22test_svld1_vnum_u64_x2u11__SVCount_tPKml( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ld1.pn.vg2.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP3]], [[TMP4]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svuint64x2_t test_svld1_vnum_u64_x2(svcount_t pn, const uint64_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svld1_vnum,_u64,_x2,)(pn, base, vnum); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svld1_vnum_u8_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld1.pn.vg4.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP3]], [[TMP4]], i64 16) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP5]], [[TMP6]], i64 32) +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP7]], [[TMP8]], i64 48) +// CHECK-NEXT: ret [[TMP9]] +// +// CPP-CHECK-LABEL: @_Z21test_svld1_vnum_u8_x4u11__SVCount_tPKhl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld1.pn.vg4.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP3]], [[TMP4]], i64 16) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP5]], [[TMP6]], i64 32) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP7]], [[TMP8]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP9]] +// +svuint8x4_t test_svld1_vnum_u8_x4(svcount_t pn, const uint8_t *base, int64_t vnum) +{ + return 
SVE_ACLE_FUNC(svld1_vnum,_u8,_x4,)(pn, base, vnum); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svld1_vnum_u16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld1.pn.vg4.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP3]], [[TMP4]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP5]], [[TMP6]], i64 16) +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP7]], [[TMP8]], i64 24) +// CHECK-NEXT: ret [[TMP9]] +// +// CPP-CHECK-LABEL: @_Z22test_svld1_vnum_u16_x4u11__SVCount_tPKtl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld1.pn.vg4.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP3]], [[TMP4]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP5]], [[TMP6]], i64 16) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP7]], [[TMP8]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP9]] +// +svuint16x4_t test_svld1_vnum_u16_x4(svcount_t pn, const uint16_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svld1_vnum,_u16,_x4,)(pn, base, vnum); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svld1_vnum_u32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld1.pn.vg4.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP3]], [[TMP4]], i64 4) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP5]], [[TMP6]], i64 8) +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP7]], [[TMP8]], i64 12) +// CHECK-NEXT: ret [[TMP9]] +// +// CPP-CHECK-LABEL: @_Z22test_svld1_vnum_u32_x4u11__SVCount_tPKjl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 
[[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld1.pn.vg4.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP3]], [[TMP4]], i64 4) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP5]], [[TMP6]], i64 8) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP7]], [[TMP8]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP9]] +// +svuint32x4_t test_svld1_vnum_u32_x4(svcount_t pn, const uint32_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svld1_vnum,_u32,_x4,)(pn, base, vnum); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svld1_vnum_u64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld1.pn.vg4.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP3]], [[TMP4]], i64 2) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP5]], [[TMP6]], i64 4) +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP7]], [[TMP8]], i64 6) +// CHECK-NEXT: ret [[TMP9]] +// +// CPP-CHECK-LABEL: @_Z22test_svld1_vnum_u64_x4u11__SVCount_tPKml( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld1.pn.vg4.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP3]], [[TMP4]], i64 2) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP5]], [[TMP6]], i64 4) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP7]], [[TMP8]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP9]] +// +svuint64x4_t test_svld1_vnum_u64_x4(svcount_t pn, const uint64_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svld1_vnum,_u64,_x4,)(pn, base, vnum); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svld1_vnum_s8_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = tail 
call { , } @llvm.aarch64.sve.ld1.pn.vg2.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP3]], [[TMP4]], i64 16) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z21test_svld1_vnum_s8_x2u11__SVCount_tPKal( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ld1.pn.vg2.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP3]], [[TMP4]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svint8x2_t test_svld1_vnum_s8_x2(svcount_t pn, const int8_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svld1_vnum,_s8,_x2,)(pn, base, vnum); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svld1_vnum_s16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ld1.pn.vg2.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP3]], [[TMP4]], i64 8) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z22test_svld1_vnum_s16_x2u11__SVCount_tPKsl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ld1.pn.vg2.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP3]], [[TMP4]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svint16x2_t test_svld1_vnum_s16_x2(svcount_t pn, const int16_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svld1_vnum,_s16,_x2,)(pn, base, vnum); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svld1_vnum_s32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ld1.pn.vg2.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP3]], [[TMP4]], i64 4) +// CHECK-NEXT: ret [[TMP5]] +// +// 
CPP-CHECK-LABEL: @_Z22test_svld1_vnum_s32_x2u11__SVCount_tPKil( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ld1.pn.vg2.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP3]], [[TMP4]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svint32x2_t test_svld1_vnum_s32_x2(svcount_t pn, const int32_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svld1_vnum,_s32,_x2,)(pn, base, vnum); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svld1_vnum_s64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ld1.pn.vg2.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP3]], [[TMP4]], i64 2) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z22test_svld1_vnum_s64_x2u11__SVCount_tPKll( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ld1.pn.vg2.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP3]], [[TMP4]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svint64x2_t test_svld1_vnum_s64_x2(svcount_t pn, const int64_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svld1_vnum,_s64,_x2,)(pn, base, vnum); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svld1_vnum_s8_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld1.pn.vg4.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP3]], [[TMP4]], i64 16) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP5]], [[TMP6]], i64 32) +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP7]], [[TMP8]], i64 48) +// CHECK-NEXT: ret [[TMP9]] +// +// CPP-CHECK-LABEL: @_Z21test_svld1_vnum_s8_x4u11__SVCount_tPKal( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: 
[[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld1.pn.vg4.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP3]], [[TMP4]], i64 16) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP5]], [[TMP6]], i64 32) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP7]], [[TMP8]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP9]] +// +svint8x4_t test_svld1_vnum_s8_x4(svcount_t pn, const int8_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svld1_vnum,_s8,_x4,)(pn, base, vnum); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svld1_vnum_s16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld1.pn.vg4.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP3]], [[TMP4]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP5]], [[TMP6]], i64 16) +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP7]], [[TMP8]], i64 24) +// CHECK-NEXT: ret [[TMP9]] +// +// CPP-CHECK-LABEL: @_Z22test_svld1_vnum_s16_x4u11__SVCount_tPKsl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld1.pn.vg4.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP3]], [[TMP4]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP5]], [[TMP6]], i64 16) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP7]], [[TMP8]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP9]] +// +svint16x4_t test_svld1_vnum_s16_x4(svcount_t pn, const int16_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svld1_vnum,_s16,_x4,)(pn, base, vnum); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svld1_vnum_s32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], 
i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld1.pn.vg4.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP3]], [[TMP4]], i64 4) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP5]], [[TMP6]], i64 8) +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP7]], [[TMP8]], i64 12) +// CHECK-NEXT: ret [[TMP9]] +// +// CPP-CHECK-LABEL: @_Z22test_svld1_vnum_s32_x4u11__SVCount_tPKil( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld1.pn.vg4.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP3]], [[TMP4]], i64 4) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP5]], [[TMP6]], i64 8) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP7]], [[TMP8]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP9]] +// +svint32x4_t test_svld1_vnum_s32_x4(svcount_t pn, const int32_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svld1_vnum,_s32,_x4,)(pn, base, vnum); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svld1_vnum_s64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld1.pn.vg4.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP3]], [[TMP4]], i64 2) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP5]], [[TMP6]], i64 4) +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP7]], [[TMP8]], i64 6) +// CHECK-NEXT: ret [[TMP9]] +// +// CPP-CHECK-LABEL: @_Z22test_svld1_vnum_s64_x4u11__SVCount_tPKll( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld1.pn.vg4.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// 
CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP3]], [[TMP4]], i64 2) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP5]], [[TMP6]], i64 4) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP7]], [[TMP8]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP9]] +// +svint64x4_t test_svld1_vnum_s64_x4(svcount_t pn, const int64_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svld1_vnum,_s64,_x4,)(pn, base, vnum); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svld1_vnum_f16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ld1.pn.vg2.nxv8f16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP3]], [[TMP4]], i64 8) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z22test_svld1_vnum_f16_x2u11__SVCount_tPKDhl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ld1.pn.vg2.nxv8f16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP3]], [[TMP4]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svfloat16x2_t test_svld1_vnum_f16_x2(svcount_t pn, const float16_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svld1_vnum,_f16,_x2,)(pn, base, vnum); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svld1_vnum_f32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ld1.pn.vg2.nxv4f32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP3]], [[TMP4]], i64 4) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z22test_svld1_vnum_f32_x2u11__SVCount_tPKfl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ld1.pn.vg2.nxv4f32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP2]], i64 
0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP3]], [[TMP4]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svfloat32x2_t test_svld1_vnum_f32_x2(svcount_t pn, const float32_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svld1_vnum,_f32,_x2,)(pn, base, vnum); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svld1_vnum_f64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ld1.pn.vg2.nxv2f64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP3]], [[TMP4]], i64 2) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z22test_svld1_vnum_f64_x2u11__SVCount_tPKdl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ld1.pn.vg2.nxv2f64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP3]], [[TMP4]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svfloat64x2_t test_svld1_vnum_f64_x2(svcount_t pn, const float64_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svld1_vnum,_f64,_x2,)(pn, base, vnum); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svld1_vnum_f16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld1.pn.vg4.nxv8f16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP3]], [[TMP4]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP5]], [[TMP6]], i64 16) +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP7]], [[TMP8]], i64 24) +// CHECK-NEXT: ret [[TMP9]] +// +// CPP-CHECK-LABEL: @_Z22test_svld1_vnum_f16_x4u11__SVCount_tPKDhl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld1.pn.vg4.nxv8f16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: 
[[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP3]], [[TMP4]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP5]], [[TMP6]], i64 16) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP7]], [[TMP8]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP9]] +// +svfloat16x4_t test_svld1_vnum_f16_x4(svcount_t pn, const float16_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svld1_vnum,_f16,_x4,)(pn, base, vnum); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svld1_vnum_f32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld1.pn.vg4.nxv4f32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP3]], [[TMP4]], i64 4) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP5]], [[TMP6]], i64 8) +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP7]], [[TMP8]], i64 12) +// CHECK-NEXT: ret [[TMP9]] +// +// CPP-CHECK-LABEL: @_Z22test_svld1_vnum_f32_x4u11__SVCount_tPKfl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld1.pn.vg4.nxv4f32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP3]], [[TMP4]], i64 4) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP5]], [[TMP6]], i64 8) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP7]], [[TMP8]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP9]] +// +svfloat32x4_t test_svld1_vnum_f32_x4(svcount_t pn, const float32_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svld1_vnum,_f32,_x4,)(pn, base, vnum); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svld1_vnum_f64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld1.pn.vg4.nxv2f64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call 
@llvm.vector.insert.nxv8f64.nxv2f64( [[TMP3]], [[TMP4]], i64 2) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP5]], [[TMP6]], i64 4) +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP7]], [[TMP8]], i64 6) +// CHECK-NEXT: ret [[TMP9]] +// +// CPP-CHECK-LABEL: @_Z22test_svld1_vnum_f64_x4u11__SVCount_tPKdl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld1.pn.vg4.nxv2f64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP3]], [[TMP4]], i64 2) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP5]], [[TMP6]], i64 4) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP7]], [[TMP8]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP9]] +// +svfloat64x4_t test_svld1_vnum_f64_x4(svcount_t pn, const float64_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svld1_vnum,_f64,_x4,)(pn, base, vnum); +} diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_ldnt1.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_ldnt1.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_ldnt1.c @@ -0,0 +1,1294 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. 
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 +#endif + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svldnt1_u8_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ldnt1.pn.vg2.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z18test_svldnt1_u8_x2u11__SVCount_tPKh( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ldnt1.pn.vg2.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint8x2_t test_svldnt1_u8_x2(svcount_t pn, const uint8_t *base) +{ + return SVE_ACLE_FUNC(svldnt1,_u8,_x2,)(pn, base); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svldnt1_u16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ldnt1.pn.vg2.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z19test_svldnt1_u16_x2u11__SVCount_tPKt( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ldnt1.pn.vg2.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint16x2_t test_svldnt1_u16_x2(svcount_t pn, const uint16_t *base) +{ + return SVE_ACLE_FUNC(svldnt1,_u16,_x2,)(pn, base); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svldnt1_u32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ldnt1.pn.vg2.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z19test_svldnt1_u32_x2u11__SVCount_tPKj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: 
[[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ldnt1.pn.vg2.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint32x2_t test_svldnt1_u32_x2(svcount_t pn, const uint32_t *base) +{ + return SVE_ACLE_FUNC(svldnt1,_u32,_x2,)(pn, base); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svldnt1_u64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ldnt1.pn.vg2.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z19test_svldnt1_u64_x2u11__SVCount_tPKm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ldnt1.pn.vg2.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint64x2_t test_svldnt1_u64_x2(svcount_t pn, const uint64_t *base) +{ + return SVE_ACLE_FUNC(svldnt1,_u64,_x2,)(pn, base); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svldnt1_u8_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.ldnt1.pn.vg4.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z18test_svldnt1_u8_x4u11__SVCount_tPKh( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.ldnt1.pn.vg4.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// 
CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint8x4_t test_svldnt1_u8_x4(svcount_t pn, const uint8_t *base) +{ + return SVE_ACLE_FUNC(svldnt1,_u8,_x4,)(pn, base); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svldnt1_u16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.ldnt1.pn.vg4.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z19test_svldnt1_u16_x4u11__SVCount_tPKt( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.ldnt1.pn.vg4.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint16x4_t test_svldnt1_u16_x4(svcount_t pn, const uint16_t *base) +{ + return SVE_ACLE_FUNC(svldnt1,_u16,_x4,)(pn, base); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svldnt1_u32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.ldnt1.pn.vg4.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) +// CHECK-NEXT: 
ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z19test_svldnt1_u32_x4u11__SVCount_tPKj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.ldnt1.pn.vg4.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint32x4_t test_svldnt1_u32_x4(svcount_t pn, const uint32_t *base) +{ + return SVE_ACLE_FUNC(svldnt1,_u32,_x4,)(pn, base); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svldnt1_u64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.ldnt1.pn.vg4.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z19test_svldnt1_u64_x4u11__SVCount_tPKm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.ldnt1.pn.vg4.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint64x4_t test_svldnt1_u64_x4(svcount_t pn, const uint64_t *base) +{ + return SVE_ACLE_FUNC(svldnt1,_u64,_x4,)(pn, base); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svldnt1_s8_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ldnt1.pn.vg2.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = 
tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z18test_svldnt1_s8_x2u11__SVCount_tPKa( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ldnt1.pn.vg2.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svint8x2_t test_svldnt1_s8_x2(svcount_t pn, const int8_t *base) +{ + return SVE_ACLE_FUNC(svldnt1,_s8,_x2,)(pn, base); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svldnt1_s16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ldnt1.pn.vg2.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z19test_svldnt1_s16_x2u11__SVCount_tPKs( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ldnt1.pn.vg2.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svint16x2_t test_svldnt1_s16_x2(svcount_t pn, const int16_t *base) +{ + return SVE_ACLE_FUNC(svldnt1,_s16,_x2,)(pn, base); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svldnt1_s32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ldnt1.pn.vg2.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z19test_svldnt1_s32_x2u11__SVCount_tPKi( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ldnt1.pn.vg2.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], 
i64 4) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svint32x2_t test_svldnt1_s32_x2(svcount_t pn, const int32_t *base) +{ + return SVE_ACLE_FUNC(svldnt1,_s32,_x2,)(pn, base); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svldnt1_s64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ldnt1.pn.vg2.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z19test_svldnt1_s64_x2u11__SVCount_tPKl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ldnt1.pn.vg2.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svint64x2_t test_svldnt1_s64_x2(svcount_t pn, const int64_t *base) +{ + return SVE_ACLE_FUNC(svldnt1,_s64,_x2,)(pn, base); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svldnt1_s8_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.ldnt1.pn.vg4.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z18test_svldnt1_s8_x4u11__SVCount_tPKa( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.ldnt1.pn.vg4.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint8x4_t test_svldnt1_s8_x4(svcount_t pn, const int8_t *base) +{ + return 
SVE_ACLE_FUNC(svldnt1,_s8,_x4,)(pn, base); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svldnt1_s16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.ldnt1.pn.vg4.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z19test_svldnt1_s16_x4u11__SVCount_tPKs( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.ldnt1.pn.vg4.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint16x4_t test_svldnt1_s16_x4(svcount_t pn, const int16_t *base) +{ + return SVE_ACLE_FUNC(svldnt1,_s16,_x4,)(pn, base); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svldnt1_s32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.ldnt1.pn.vg4.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z19test_svldnt1_s32_x4u11__SVCount_tPKi( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.ldnt1.pn.vg4.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = 
extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint32x4_t test_svldnt1_s32_x4(svcount_t pn, const int32_t *base) +{ + return SVE_ACLE_FUNC(svldnt1,_s32,_x4,)(pn, base); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svldnt1_s64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.ldnt1.pn.vg4.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z19test_svldnt1_s64_x4u11__SVCount_tPKl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.ldnt1.pn.vg4.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint64x4_t test_svldnt1_s64_x4(svcount_t pn, const int64_t *base) +{ + return SVE_ACLE_FUNC(svldnt1,_s64,_x4,)(pn, base); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svldnt1_f16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ldnt1.pn.vg2.nxv8f16(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z19test_svldnt1_f16_x2u11__SVCount_tPKDh( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } 
@llvm.aarch64.sve.ldnt1.pn.vg2.nxv8f16(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svfloat16x2_t test_svldnt1_f16_x2(svcount_t pn, const float16_t *base) +{ + return SVE_ACLE_FUNC(svldnt1,_f16,_x2,)(pn, base); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svldnt1_f32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ldnt1.pn.vg2.nxv4f32(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z19test_svldnt1_f32_x2u11__SVCount_tPKf( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ldnt1.pn.vg2.nxv4f32(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svfloat32x2_t test_svldnt1_f32_x2(svcount_t pn, const float32_t *base) +{ + return SVE_ACLE_FUNC(svldnt1,_f32,_x2,)(pn, base); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svldnt1_f64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ldnt1.pn.vg2.nxv2f64(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z19test_svldnt1_f64_x2u11__SVCount_tPKd( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ldnt1.pn.vg2.nxv2f64(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svfloat64x2_t test_svldnt1_f64_x2(svcount_t pn, const float64_t *base) +{ + return SVE_ACLE_FUNC(svldnt1,_f64,_x2,)(pn, base); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svldnt1_f16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.ldnt1.pn.vg4.nxv8f16(target("aarch64.svcount") [[PN:%.*]], ptr 
[[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP4]], [[TMP5]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 24) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z19test_svldnt1_f16_x4u11__SVCount_tPKDh( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.ldnt1.pn.vg4.nxv8f16(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP4]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat16x4_t test_svldnt1_f16_x4(svcount_t pn, const float16_t *base) +{ + return SVE_ACLE_FUNC(svldnt1,_f16,_x4,)(pn, base); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svldnt1_f32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.ldnt1.pn.vg4.nxv4f32(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z19test_svldnt1_f32_x4u11__SVCount_tPKf( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.ldnt1.pn.vg4.nxv4f32(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call 
@llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat32x4_t test_svldnt1_f32_x4(svcount_t pn, const float32_t *base) +{ + return SVE_ACLE_FUNC(svldnt1,_f32,_x4,)(pn, base); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svldnt1_f64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.ldnt1.pn.vg4.nxv2f64(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 6) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z19test_svldnt1_f64_x4u11__SVCount_tPKd( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.ldnt1.pn.vg4.nxv2f64(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat64x4_t test_svldnt1_f64_x4(svcount_t pn, const float64_t *base) +{ + return SVE_ACLE_FUNC(svldnt1,_f64,_x4,)(pn, base); +} + + +// == VNUM variants == + + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svldnt1_vnum_u8_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ldnt1.pn.vg2.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP3]], [[TMP4]], i64 16) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z23test_svldnt1_vnum_u8_x2u11__SVCount_tPKhl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ldnt1.pn.vg2.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: 
[[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP3]], [[TMP4]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svuint8x2_t test_svldnt1_vnum_u8_x2(svcount_t pn, const uint8_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svldnt1_vnum,_u8,_x2,)(pn, base, vnum); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svldnt1_vnum_u16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ldnt1.pn.vg2.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP3]], [[TMP4]], i64 8) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z24test_svldnt1_vnum_u16_x2u11__SVCount_tPKtl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ldnt1.pn.vg2.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP3]], [[TMP4]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svuint16x2_t test_svldnt1_vnum_u16_x2(svcount_t pn, const uint16_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svldnt1_vnum,_u16,_x2,)(pn, base, vnum); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svldnt1_vnum_u32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ldnt1.pn.vg2.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP3]], [[TMP4]], i64 4) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z24test_svldnt1_vnum_u32_x2u11__SVCount_tPKjl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ldnt1.pn.vg2.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP3]], [[TMP4]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svuint32x2_t test_svldnt1_vnum_u32_x2(svcount_t pn, const uint32_t *base, 
int64_t vnum) +{ + return SVE_ACLE_FUNC(svldnt1_vnum,_u32,_x2,)(pn, base, vnum); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svldnt1_vnum_u64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ldnt1.pn.vg2.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP3]], [[TMP4]], i64 2) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z24test_svldnt1_vnum_u64_x2u11__SVCount_tPKml( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ldnt1.pn.vg2.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP3]], [[TMP4]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svuint64x2_t test_svldnt1_vnum_u64_x2(svcount_t pn, const uint64_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svldnt1_vnum,_u64,_x2,)(pn, base, vnum); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svldnt1_vnum_u8_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ldnt1.pn.vg4.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP3]], [[TMP4]], i64 16) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP5]], [[TMP6]], i64 32) +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP7]], [[TMP8]], i64 48) +// CHECK-NEXT: ret [[TMP9]] +// +// CPP-CHECK-LABEL: @_Z23test_svldnt1_vnum_u8_x4u11__SVCount_tPKhl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ldnt1.pn.vg4.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP3]], [[TMP4]], i64 16) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP5]], [[TMP6]], i64 32) +// 
CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP7]], [[TMP8]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP9]] +// +svuint8x4_t test_svldnt1_vnum_u8_x4(svcount_t pn, const uint8_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svldnt1_vnum,_u8,_x4,)(pn, base, vnum); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svldnt1_vnum_u16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ldnt1.pn.vg4.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP3]], [[TMP4]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP5]], [[TMP6]], i64 16) +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP7]], [[TMP8]], i64 24) +// CHECK-NEXT: ret [[TMP9]] +// +// CPP-CHECK-LABEL: @_Z24test_svldnt1_vnum_u16_x4u11__SVCount_tPKtl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ldnt1.pn.vg4.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP3]], [[TMP4]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP5]], [[TMP6]], i64 16) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP7]], [[TMP8]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP9]] +// +svuint16x4_t test_svldnt1_vnum_u16_x4(svcount_t pn, const uint16_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svldnt1_vnum,_u16,_x4,)(pn, base, vnum); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svldnt1_vnum_u32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ldnt1.pn.vg4.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP3]], [[TMP4]], i64 4) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP5]], [[TMP6]], i64 8) +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } 
[[TMP1]], 3 +// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP7]], [[TMP8]], i64 12) +// CHECK-NEXT: ret [[TMP9]] +// +// CPP-CHECK-LABEL: @_Z24test_svldnt1_vnum_u32_x4u11__SVCount_tPKjl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ldnt1.pn.vg4.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP3]], [[TMP4]], i64 4) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP5]], [[TMP6]], i64 8) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP7]], [[TMP8]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP9]] +// +svuint32x4_t test_svldnt1_vnum_u32_x4(svcount_t pn, const uint32_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svldnt1_vnum,_u32,_x4,)(pn, base, vnum); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svldnt1_vnum_u64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ldnt1.pn.vg4.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP3]], [[TMP4]], i64 2) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP5]], [[TMP6]], i64 4) +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP7]], [[TMP8]], i64 6) +// CHECK-NEXT: ret [[TMP9]] +// +// CPP-CHECK-LABEL: @_Z24test_svldnt1_vnum_u64_x4u11__SVCount_tPKml( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ldnt1.pn.vg4.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP3]], [[TMP4]], i64 2) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP5]], [[TMP6]], i64 4) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP7]], [[TMP8]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP9]] +// +svuint64x4_t 
test_svldnt1_vnum_u64_x4(svcount_t pn, const uint64_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svldnt1_vnum,_u64,_x4,)(pn, base, vnum); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svldnt1_vnum_s8_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ldnt1.pn.vg2.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP3]], [[TMP4]], i64 16) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z23test_svldnt1_vnum_s8_x2u11__SVCount_tPKal( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ldnt1.pn.vg2.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP3]], [[TMP4]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svint8x2_t test_svldnt1_vnum_s8_x2(svcount_t pn, const int8_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svldnt1_vnum,_s8,_x2,)(pn, base, vnum); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svldnt1_vnum_s16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ldnt1.pn.vg2.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP3]], [[TMP4]], i64 8) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z24test_svldnt1_vnum_s16_x2u11__SVCount_tPKsl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ldnt1.pn.vg2.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP3]], [[TMP4]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svint16x2_t test_svldnt1_vnum_s16_x2(svcount_t pn, const int16_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svldnt1_vnum,_s16,_x2,)(pn, base, vnum); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svldnt1_vnum_s32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ldnt1.pn.vg2.nxv4i32(target("aarch64.svcount") 
[[PN:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP3]], [[TMP4]], i64 4) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z24test_svldnt1_vnum_s32_x2u11__SVCount_tPKil( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ldnt1.pn.vg2.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP3]], [[TMP4]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svint32x2_t test_svldnt1_vnum_s32_x2(svcount_t pn, const int32_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svldnt1_vnum,_s32,_x2,)(pn, base, vnum); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svldnt1_vnum_s64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ldnt1.pn.vg2.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP3]], [[TMP4]], i64 2) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z24test_svldnt1_vnum_s64_x2u11__SVCount_tPKll( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ldnt1.pn.vg2.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP3]], [[TMP4]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svint64x2_t test_svldnt1_vnum_s64_x2(svcount_t pn, const int64_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svldnt1_vnum,_s64,_x2,)(pn, base, vnum); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svldnt1_vnum_s8_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ldnt1.pn.vg4.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP3]], [[TMP4]], i64 16) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// 
CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP5]], [[TMP6]], i64 32) +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP7]], [[TMP8]], i64 48) +// CHECK-NEXT: ret [[TMP9]] +// +// CPP-CHECK-LABEL: @_Z23test_svldnt1_vnum_s8_x4u11__SVCount_tPKal( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ldnt1.pn.vg4.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP3]], [[TMP4]], i64 16) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP5]], [[TMP6]], i64 32) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP7]], [[TMP8]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP9]] +// +svint8x4_t test_svldnt1_vnum_s8_x4(svcount_t pn, const int8_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svldnt1_vnum,_s8,_x4,)(pn, base, vnum); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svldnt1_vnum_s16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ldnt1.pn.vg4.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP3]], [[TMP4]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP5]], [[TMP6]], i64 16) +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP7]], [[TMP8]], i64 24) +// CHECK-NEXT: ret [[TMP9]] +// +// CPP-CHECK-LABEL: @_Z24test_svldnt1_vnum_s16_x4u11__SVCount_tPKsl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ldnt1.pn.vg4.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP3]], [[TMP4]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP5]], [[TMP6]], i64 16) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CPP-CHECK-NEXT: 
[[TMP9:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP7]], [[TMP8]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP9]] +// +svint16x4_t test_svldnt1_vnum_s16_x4(svcount_t pn, const int16_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svldnt1_vnum,_s16,_x4,)(pn, base, vnum); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svldnt1_vnum_s32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ldnt1.pn.vg4.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP3]], [[TMP4]], i64 4) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP5]], [[TMP6]], i64 8) +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP7]], [[TMP8]], i64 12) +// CHECK-NEXT: ret [[TMP9]] +// +// CPP-CHECK-LABEL: @_Z24test_svldnt1_vnum_s32_x4u11__SVCount_tPKil( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ldnt1.pn.vg4.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP3]], [[TMP4]], i64 4) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP5]], [[TMP6]], i64 8) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP7]], [[TMP8]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP9]] +// +svint32x4_t test_svldnt1_vnum_s32_x4(svcount_t pn, const int32_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svldnt1_vnum,_s32,_x4,)(pn, base, vnum); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svldnt1_vnum_s64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ldnt1.pn.vg4.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP3]], [[TMP4]], i64 2) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP5]], [[TMP6]], i64 4) +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( 
[[TMP7]], [[TMP8]], i64 6) +// CHECK-NEXT: ret [[TMP9]] +// +// CPP-CHECK-LABEL: @_Z24test_svldnt1_vnum_s64_x4u11__SVCount_tPKll( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ldnt1.pn.vg4.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP3]], [[TMP4]], i64 2) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP5]], [[TMP6]], i64 4) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP7]], [[TMP8]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP9]] +// +svint64x4_t test_svldnt1_vnum_s64_x4(svcount_t pn, const int64_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svldnt1_vnum,_s64,_x4,)(pn, base, vnum); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svldnt1_vnum_f16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ldnt1.pn.vg2.nxv8f16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP3]], [[TMP4]], i64 8) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z24test_svldnt1_vnum_f16_x2u11__SVCount_tPKDhl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ldnt1.pn.vg2.nxv8f16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP3]], [[TMP4]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svfloat16x2_t test_svldnt1_vnum_f16_x2(svcount_t pn, const float16_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svldnt1_vnum,_f16,_x2,)(pn, base, vnum); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svldnt1_vnum_f32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ldnt1.pn.vg2.nxv4f32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP3]], [[TMP4]], i64 4) +// CHECK-NEXT: ret [[TMP5]] +// +// 
CPP-CHECK-LABEL: @_Z24test_svldnt1_vnum_f32_x2u11__SVCount_tPKfl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ldnt1.pn.vg2.nxv4f32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP3]], [[TMP4]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svfloat32x2_t test_svldnt1_vnum_f32_x2(svcount_t pn, const float32_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svldnt1_vnum,_f32,_x2,)(pn, base, vnum); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svldnt1_vnum_f64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ldnt1.pn.vg2.nxv2f64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP3]], [[TMP4]], i64 2) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z24test_svldnt1_vnum_f64_x2u11__SVCount_tPKdl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ldnt1.pn.vg2.nxv2f64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP3]], [[TMP4]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svfloat64x2_t test_svldnt1_vnum_f64_x2(svcount_t pn, const float64_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svldnt1_vnum,_f64,_x2,)(pn, base, vnum); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svldnt1_vnum_f16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ldnt1.pn.vg4.nxv8f16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP3]], [[TMP4]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP5]], [[TMP6]], i64 16) +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP7]], [[TMP8]], i64 24) +// CHECK-NEXT: ret [[TMP9]] +// +// CPP-CHECK-LABEL: @_Z24test_svldnt1_vnum_f16_x4u11__SVCount_tPKDhl( +// 
CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ldnt1.pn.vg4.nxv8f16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP3]], [[TMP4]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP5]], [[TMP6]], i64 16) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP7]], [[TMP8]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP9]] +// +svfloat16x4_t test_svldnt1_vnum_f16_x4(svcount_t pn, const float16_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svldnt1_vnum,_f16,_x4,)(pn, base, vnum); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svldnt1_vnum_f32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ldnt1.pn.vg4.nxv4f32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP3]], [[TMP4]], i64 4) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP5]], [[TMP6]], i64 8) +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP7]], [[TMP8]], i64 12) +// CHECK-NEXT: ret [[TMP9]] +// +// CPP-CHECK-LABEL: @_Z24test_svldnt1_vnum_f32_x4u11__SVCount_tPKfl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ldnt1.pn.vg4.nxv4f32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP3]], [[TMP4]], i64 4) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP5]], [[TMP6]], i64 8) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP7]], [[TMP8]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP9]] +// +svfloat32x4_t test_svldnt1_vnum_f32_x4(svcount_t pn, const float32_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svldnt1_vnum,_f32,_x4,)(pn, base, vnum); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svldnt1_vnum_f64_x4( +// 
CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ldnt1.pn.vg4.nxv2f64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP3]], [[TMP4]], i64 2) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP5]], [[TMP6]], i64 4) +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP7]], [[TMP8]], i64 6) +// CHECK-NEXT: ret [[TMP9]] +// +// CPP-CHECK-LABEL: @_Z24test_svldnt1_vnum_f64_x4u11__SVCount_tPKdl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ldnt1.pn.vg4.nxv2f64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP3]], [[TMP4]], i64 2) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP5]], [[TMP6]], i64 4) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP7]], [[TMP8]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP9]] +// +svfloat64x4_t test_svldnt1_vnum_f64_x4(svcount_t pn, const float64_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svldnt1_vnum,_f64,_x4,)(pn, base, vnum); +} diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_pext.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_pext.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_pext.c @@ -0,0 +1,346 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK + +#include + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svpext_lane_c8_0( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.pext.nxv16i1(target("aarch64.svcount") [[C:%.*]], i32 0) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z21test_svpext_lane_c8_0u11__SVCount_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.pext.nxv16i1(target("aarch64.svcount") [[C:%.*]], i32 0) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svbool_t test_svpext_lane_c8_0(svcount_t c) { + return svpext_lane_c8(c, 0); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: 
@test_svpext_lane_c8_3( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.pext.nxv16i1(target("aarch64.svcount") [[C:%.*]], i32 3) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z21test_svpext_lane_c8_3u11__SVCount_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.pext.nxv16i1(target("aarch64.svcount") [[C:%.*]], i32 3) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svbool_t test_svpext_lane_c8_3(svcount_t c) { + return svpext_lane_c8(c, 3); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svpext_lane_c16_0( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.pext.nxv8i1(target("aarch64.svcount") [[C:%.*]], i32 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP0]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z22test_svpext_lane_c16_0u11__SVCount_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.pext.nxv8i1(target("aarch64.svcount") [[C:%.*]], i32 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP0]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svbool_t test_svpext_lane_c16_0(svcount_t c) { + return svpext_lane_c16(c, 0); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svpext_lane_c16_3( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.pext.nxv8i1(target("aarch64.svcount") [[C:%.*]], i32 3) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP0]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z22test_svpext_lane_c16_3u11__SVCount_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.pext.nxv8i1(target("aarch64.svcount") [[C:%.*]], i32 3) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP0]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svbool_t test_svpext_lane_c16_3(svcount_t c) { + return svpext_lane_c16(c, 3); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svpext_lane_c32_0( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.pext.nxv4i1(target("aarch64.svcount") [[C:%.*]], i32 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP0]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z22test_svpext_lane_c32_0u11__SVCount_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.pext.nxv4i1(target("aarch64.svcount") [[C:%.*]], i32 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP0]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svbool_t test_svpext_lane_c32_0(svcount_t c) { + return svpext_lane_c32(c, 0); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svpext_lane_c32_3( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.pext.nxv4i1(target("aarch64.svcount") [[C:%.*]], i32 3) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP0]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z22test_svpext_lane_c32_3u11__SVCount_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.pext.nxv4i1(target("aarch64.svcount") [[C:%.*]], i32 3) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP0]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svbool_t 
test_svpext_lane_c32_3(svcount_t c) { + return svpext_lane_c32(c, 3); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svpext_lane_c64_0( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.pext.nxv2i1(target("aarch64.svcount") [[C:%.*]], i32 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP0]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z22test_svpext_lane_c64_0u11__SVCount_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.pext.nxv2i1(target("aarch64.svcount") [[C:%.*]], i32 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP0]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svbool_t test_svpext_lane_c64_0(svcount_t c) { + return svpext_lane_c64(c, 0); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svpext_lane_c64_3( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.pext.nxv2i1(target("aarch64.svcount") [[C:%.*]], i32 3) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP0]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z22test_svpext_lane_c64_3u11__SVCount_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.pext.nxv2i1(target("aarch64.svcount") [[C:%.*]], i32 3) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP0]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svbool_t test_svpext_lane_c64_3(svcount_t c) { + return svpext_lane_c64(c, 3); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svpext_lane_c8_x2_0( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.pext.x2.nxv16i1(target("aarch64.svcount") [[C:%.*]], i32 0) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP2]], [[TMP3]], i64 16) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z24test_svpext_lane_c8_x2_0u11__SVCount_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.pext.x2.nxv16i1(target("aarch64.svcount") [[C:%.*]], i32 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP2]], [[TMP3]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svboolx2_t test_svpext_lane_c8_x2_0(svcount_t c) { + return svpext_lane_c8_x2(c, 0); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svpext_lane_c8_x2_1( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.pext.x2.nxv16i1(target("aarch64.svcount") [[C:%.*]], i32 1) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP2]], [[TMP3]], i64 16) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: 
@_Z24test_svpext_lane_c8_x2_1u11__SVCount_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.pext.x2.nxv16i1(target("aarch64.svcount") [[C:%.*]], i32 1) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP2]], [[TMP3]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svboolx2_t test_svpext_lane_c8_x2_1(svcount_t c) { + return svpext_lane_c8_x2(c, 1); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svpext_lane_c16_x2_0( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.pext.x2.nxv8i1(target("aarch64.svcount") [[C:%.*]], i32 0) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP4]]) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z25test_svpext_lane_c16_x2_0u11__SVCount_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.pext.x2.nxv8i1(target("aarch64.svcount") [[C:%.*]], i32 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP4]]) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svboolx2_t test_svpext_lane_c16_x2_0(svcount_t c) { + return svpext_lane_c16_x2(c, 0); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svpext_lane_c16_x2_1( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.pext.x2.nxv8i1(target("aarch64.svcount") [[C:%.*]], i32 1) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP4]]) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z25test_svpext_lane_c16_x2_1u11__SVCount_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.pext.x2.nxv8i1(target("aarch64.svcount") [[C:%.*]], i32 1) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP1]]) +// CPP-CHECK-NEXT: 
[[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP4]]) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svboolx2_t test_svpext_lane_c16_x2_1(svcount_t c) { + return svpext_lane_c16_x2(c, 1); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svpext_lane_c32_x2_0( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.pext.x2.nxv4i1(target("aarch64.svcount") [[C:%.*]], i32 0) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP4]]) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z25test_svpext_lane_c32_x2_0u11__SVCount_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.pext.x2.nxv4i1(target("aarch64.svcount") [[C:%.*]], i32 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP4]]) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svboolx2_t test_svpext_lane_c32_x2_0(svcount_t c) { + return svpext_lane_c32_x2(c, 0); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svpext_lane_c32_x2_1( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.pext.x2.nxv4i1(target("aarch64.svcount") [[C:%.*]], i32 1) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP4]]) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z25test_svpext_lane_c32_x2_1u11__SVCount_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.pext.x2.nxv4i1(target("aarch64.svcount") [[C:%.*]], i32 1) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail 
call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP4]]) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svboolx2_t test_svpext_lane_c32_x2_1(svcount_t c) { + return svpext_lane_c32_x2(c, 1); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svpext_lane_c64_x2_0( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.pext.x2.nxv2i1(target("aarch64.svcount") [[C:%.*]], i32 0) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP4]]) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z25test_svpext_lane_c64_x2_0u11__SVCount_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.pext.x2.nxv2i1(target("aarch64.svcount") [[C:%.*]], i32 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP4]]) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svboolx2_t test_svpext_lane_c64_x2_0(svcount_t c) { + return svpext_lane_c64_x2(c, 0); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svpext_lane_c64_x2_1( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.pext.x2.nxv2i1(target("aarch64.svcount") [[C:%.*]], i32 1) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP4]]) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z25test_svpext_lane_c64_x2_1u11__SVCount_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.pext.x2.nxv2i1(target("aarch64.svcount") [[C:%.*]], i32 1) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP4]]) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: ret 
[[TMP6]] +// +svboolx2_t test_svpext_lane_c64_x2_1(svcount_t c) { + return svpext_lane_c64_x2(c, 1); +} diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_pfalse.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_pfalse.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_pfalse.c @@ -0,0 +1,30 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 +#endif + +// CHECK-LABEL: @test_svpfalse_c( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.convert.from.svbool.taarch64.svcountt( zeroinitializer) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z15test_svpfalse_cv( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.convert.from.svbool.taarch64.svcountt( zeroinitializer) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svpfalse_c() +{ + return SVE_ACLE_FUNC(svpfalse_c,,,)(); +} diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_psel.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_psel.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_psel.c @@ -0,0 +1,164 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu \ +// RUN: -target-feature +sme2 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu \ +// RUN: -target-feature +sme2 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK + +#include + +// CHECK-LABEL: @test_svpsel_lane_b8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[IDX:%.*]], 15 +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.psel.nxv16i1( [[P1:%.*]], [[P2:%.*]], i32 [[TMP0]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z19test_svpsel_lane_b8u10__SVBool_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[IDX:%.*]], 15 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.psel.nxv16i1( [[P1:%.*]], [[P2:%.*]], i32 [[TMP0]]) +// 
CPP-CHECK-NEXT: ret [[TMP1]] +// +svbool_t test_svpsel_lane_b8(svbool_t p1, svbool_t p2, uint32_t idx) { + return svpsel_lane_b8(p1, p2, idx, 15); +} + +// CHECK-LABEL: @test_svpsel_lane_b16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[P2:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[IDX:%.*]], 7 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.psel.nxv8i1( [[P1:%.*]], [[TMP0]], i32 [[TMP1]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z20test_svpsel_lane_b16u10__SVBool_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[P2:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[IDX:%.*]], 7 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.psel.nxv8i1( [[P1:%.*]], [[TMP0]], i32 [[TMP1]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svbool_t test_svpsel_lane_b16(svbool_t p1, svbool_t p2, uint32_t idx) { + return svpsel_lane_b16(p1, p2, idx, 7); +} + +// CHECK-LABEL: @test_svpsel_lane_b32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[P2:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[IDX:%.*]], 3 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.psel.nxv4i1( [[P1:%.*]], [[TMP0]], i32 [[TMP1]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z20test_svpsel_lane_b32u10__SVBool_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[P2:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[IDX:%.*]], 3 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.psel.nxv4i1( [[P1:%.*]], [[TMP0]], i32 [[TMP1]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svbool_t test_svpsel_lane_b32(svbool_t p1, svbool_t p2, uint32_t idx) { + return svpsel_lane_b32(p1, p2, idx, 3); +} + +// CHECK-LABEL: @test_svpsel_lane_b64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[P2:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[IDX:%.*]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.psel.nxv2i1( [[P1:%.*]], [[TMP0]], i32 [[TMP1]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z20test_svpsel_lane_b64u10__SVBool_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[P2:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[IDX:%.*]], 1 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.psel.nxv2i1( [[P1:%.*]], [[TMP0]], i32 [[TMP1]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svbool_t test_svpsel_lane_b64(svbool_t p1, svbool_t p2, uint32_t idx) { + return svpsel_lane_b64(p1, p2, idx, 1); +} + +// CHECK-LABEL: @test_svpsel_lane_c8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.taarch64.svcountt(target("aarch64.svcount") [[P1:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[IDX:%.*]], 15 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.psel.nxv16i1( [[TMP0]], [[P2:%.*]], i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.convert.from.svbool.taarch64.svcountt( [[TMP2]]) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP3]] +// +// CPP-CHECK-LABEL: @_Z19test_svpsel_lane_c8u11__SVCount_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call 
@llvm.aarch64.sve.convert.to.svbool.taarch64.svcountt(target("aarch64.svcount") [[P1:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[IDX:%.*]], 15 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.psel.nxv16i1( [[TMP0]], [[P2:%.*]], i32 [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.convert.from.svbool.taarch64.svcountt( [[TMP2]]) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP3]] +// +svcount_t test_svpsel_lane_c8(svcount_t p1, svbool_t p2, uint32_t idx) { + return svpsel_lane_c8(p1, p2, idx, 15); +} + +// CHECK-LABEL: @test_svpsel_lane_c16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.taarch64.svcountt(target("aarch64.svcount") [[P1:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[P2:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[IDX:%.*]], 7 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.psel.nxv8i1( [[TMP0]], [[TMP1]], i32 [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.convert.from.svbool.taarch64.svcountt( [[TMP3]]) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z20test_svpsel_lane_c16u11__SVCount_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.taarch64.svcountt(target("aarch64.svcount") [[P1:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[P2:%.*]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[IDX:%.*]], 7 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.psel.nxv8i1( [[TMP0]], [[TMP1]], i32 [[TMP2]]) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.convert.from.svbool.taarch64.svcountt( [[TMP3]]) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP4]] +// +svcount_t test_svpsel_lane_c16(svcount_t p1, svbool_t p2, uint32_t idx) { + return svpsel_lane_c16(p1, p2, idx, 7); +} + +// CHECK-LABEL: @test_svpsel_lane_c32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.taarch64.svcountt(target("aarch64.svcount") [[P1:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[P2:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[IDX:%.*]], 3 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.psel.nxv4i1( [[TMP0]], [[TMP1]], i32 [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.convert.from.svbool.taarch64.svcountt( [[TMP3]]) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z20test_svpsel_lane_c32u11__SVCount_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.taarch64.svcountt(target("aarch64.svcount") [[P1:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[P2:%.*]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[IDX:%.*]], 3 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.psel.nxv4i1( [[TMP0]], [[TMP1]], i32 [[TMP2]]) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.convert.from.svbool.taarch64.svcountt( [[TMP3]]) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP4]] +// +svcount_t test_svpsel_lane_c32(svcount_t p1, svbool_t p2, uint32_t idx) { + return svpsel_lane_c32(p1, p2, idx, 3); +} + +// 
CHECK-LABEL: @test_svpsel_lane_c64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.taarch64.svcountt(target("aarch64.svcount") [[P1:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[P2:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[IDX:%.*]], 1 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.psel.nxv2i1( [[TMP0]], [[TMP1]], i32 [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.convert.from.svbool.taarch64.svcountt( [[TMP3]]) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z20test_svpsel_lane_c64u11__SVCount_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.taarch64.svcountt(target("aarch64.svcount") [[P1:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[P2:%.*]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[IDX:%.*]], 1 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.psel.nxv2i1( [[TMP0]], [[TMP1]], i32 [[TMP2]]) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.convert.from.svbool.taarch64.svcountt( [[TMP3]]) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP4]] +// +svcount_t test_svpsel_lane_c64(svcount_t p1, svbool_t p2, uint32_t idx) { + return svpsel_lane_c64(p1, p2, idx, 1); +} diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_ptrue.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_ptrue.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_ptrue.c @@ -0,0 +1,66 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK + +#include + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svptrue_c8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8() +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z15test_svptrue_c8v( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8() +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svptrue_c8(void) { + return svptrue_c8(); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svptrue_c16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c16() +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z16test_svptrue_c16v( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c16() +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svptrue_c16(void) { + return svptrue_c16(); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svptrue_c32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c32() +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z16test_svptrue_c32v( +// CPP-CHECK-NEXT: 
entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c32() +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svptrue_c32(void) { + return svptrue_c32(); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svptrue_c64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c64() +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z16test_svptrue_c64v( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c64() +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svptrue_c64(void) { + return svptrue_c64(); +} diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_qrshr.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_qrshr.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_qrshr.c @@ -0,0 +1,428 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. 
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED,A5) A1##A3##A5 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4,A5) A1##A2##A3##A4##A5 +#endif + +// SQRSHR x 2 + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svqrshr_s16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.sqrshr.x2.nxv4i32( [[TMP0]], [[TMP1]], i32 16) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z19test_svqrshr_s16_x211svint32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.sqrshr.x2.nxv4i32( [[TMP0]], [[TMP1]], i32 16) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svint16_t test_svqrshr_s16_x2(svint32x2_t zn) { + return SVE_ACLE_FUNC(svqrshr,_s16_x2,,,)(zn, 16); +} + +// UQRSHR x 2 + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svqrshr_u16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.uqrshr.x2.nxv4i32( [[TMP0]], [[TMP1]], i32 16) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z19test_svqrshr_u16_x212svuint32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.uqrshr.x2.nxv4i32( [[TMP0]], [[TMP1]], i32 16) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svuint16_t test_svqrshr_u16_x2(svuint32x2_t zn) { + return SVE_ACLE_FUNC(svqrshr,_u16_x2,,,)(zn, 16); +} + +// SQRSHR x 4 + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svqrshr_s8_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.sqrshr.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], i32 32) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z18test_svqrshr_s8_x411svint32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.sqrshr.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], i32 32) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svint8_t test_svqrshr_s8_x4(svint32x4_t zn) { + return SVE_ACLE_FUNC(svqrshr,_s8_x4,,,)(zn, 32); +} + +__attribute__((arm_streaming)) +// 
CHECK-LABEL: @test_svqrshr_s16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.sqrshr.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], i32 64) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z19test_svqrshr_s16_x411svint64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.sqrshr.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], i32 64) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svint16_t test_svqrshr_s16_x4(svint64x4_t zn) { + return SVE_ACLE_FUNC(svqrshr,_s16_x4,,,)(zn, 64); +} + +// UQRSHR x 4 + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svqrshr_u8_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.uqrshr.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], i32 32) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z18test_svqrshr_u8_x412svuint32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.uqrshr.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], i32 32) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint8_t test_svqrshr_u8_x4(svuint32x4_t zn) { + return SVE_ACLE_FUNC(svqrshr,_u8_x4,,,)(zn, 32); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svqrshr_u16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.uqrshr.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], i32 64) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z19test_svqrshr_u16_x412svuint64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call 
@llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.uqrshr.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], i32 64) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint16_t test_svqrshr_u16_x4(svuint64x4_t zn) { + return SVE_ACLE_FUNC(svqrshr,_u16_x4,,,)(zn, 64); +} + +// SQRSHRN x 2 + +__attribute__((arm_streaming_compatible)) +// CHECK-LABEL: @test_svqrshrn_s16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.sqrshrn.x2.nxv4i32( [[TMP0]], [[TMP1]], i32 16) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z20test_svqrshrn_s16_x211svint32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.sqrshrn.x2.nxv4i32( [[TMP0]], [[TMP1]], i32 16) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svint16_t test_svqrshrn_s16_x2(svint32x2_t zn) { + return SVE_ACLE_FUNC(svqrshrn,_s16_x2,,,)(zn, 16); +} + +// UQRSHRN x 2 + +__attribute__((arm_streaming_compatible)) +// CHECK-LABEL: @test_svqrshrn_u16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.uqrshrn.x2.nxv4i32( [[TMP0]], [[TMP1]], i32 16) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z20test_svqrshrn_u16_x212svuint32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.uqrshrn.x2.nxv4i32( [[TMP0]], [[TMP1]], i32 16) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svuint16_t test_svqrshrn_u16_x2(svuint32x2_t zn) { + return SVE_ACLE_FUNC(svqrshrn,_u16_x2,,,)(zn, 16); +} + +// SQRSHRN x 4 + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svqrshrn_s8_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.sqrshrn.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], i32 32) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z19test_svqrshrn_s8_x411svint32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call 
@llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.sqrshrn.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], i32 32) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svint8_t test_svqrshrn_s8_x4(svint32x4_t zn) { + return SVE_ACLE_FUNC(svqrshrn,_s8_x4,,,)(zn, 32); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svqrshrn_s16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.sqrshrn.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], i32 64) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z20test_svqrshrn_s16_x411svint64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.sqrshrn.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], i32 64) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svint16_t test_svqrshrn_s16_x4(svint64x4_t zn) { + return SVE_ACLE_FUNC(svqrshrn,_s16_x4,,,)(zn, 64); +} + +// UQRSHRN x 4 + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svqrshrn_u8_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.uqrshrn.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], i32 32) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z19test_svqrshrn_u8_x412svuint32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.uqrshrn.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], i32 32) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint8_t test_svqrshrn_u8_x4(svuint32x4_t zn) { + return SVE_ACLE_FUNC(svqrshrn,_u8_x4,,,)(zn, 32); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svqrshrn_u16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: 
[[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.uqrshrn.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], i32 64) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z20test_svqrshrn_u16_x412svuint64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.uqrshrn.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], i32 64) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint16_t test_svqrshrn_u16_x4(svuint64x4_t zn) { + return SVE_ACLE_FUNC(svqrshrn,_u16_x4,,,)(zn, 64); +} + +// SQRSHRU x 2 + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svsqrshru_u16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.sqrshru.x2.nxv4i32( [[TMP0]], [[TMP1]], i32 16) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z21test_svsqrshru_u16_x211svint32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.sqrshru.x2.nxv4i32( [[TMP0]], [[TMP1]], i32 16) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svuint16_t test_svsqrshru_u16_x2(svint32x2_t zn) { + return SVE_ACLE_FUNC(svsqrshru,_u16_x2,,,)(zn, 16); +} + +// SQRSHRU x 4 + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svsqrshru_u8_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.sqrshru.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], i32 32) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z20test_svsqrshru_u8_x411svint32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.sqrshru.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], i32 32) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint8_t test_svsqrshru_u8_x4(svint32x4_t zn) { + 
return SVE_ACLE_FUNC(svsqrshru,_u8_x4,,,)(zn, 32); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svsqrshru_u16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.sqrshru.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], i32 64) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z21test_svsqrshru_u16_x411svint64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.sqrshru.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], i32 64) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint16_t test_svsqrshru_u16_x4(svint64x4_t zn) { + return SVE_ACLE_FUNC(svsqrshru,_u16_x4,,,)(zn, 64); +} + +// SQRSHRUN x 2 + +__attribute__((arm_streaming_compatible)) +// CHECK-LABEL: @test_svqrshrun_u16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.sqrshrun.x2.nxv4i32( [[TMP0]], [[TMP1]], i32 16) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z21test_svqrshrun_u16_x211svint32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.sqrshrun.x2.nxv4i32( [[TMP0]], [[TMP1]], i32 16) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svuint16_t test_svqrshrun_u16_x2(svint32x2_t zn) { + return SVE_ACLE_FUNC(svsqrshrun,_u16_x2,,,)(zn, 16); +} + +// SQRSHRUN x 4 + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svsqrshrun_u8_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.sqrshrun.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], i32 32) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z21test_svsqrshrun_u8_x411svint32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = 
tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.sqrshrun.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], i32 32) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint8_t test_svsqrshrun_u8_x4(svint32x4_t zn) { + return SVE_ACLE_FUNC(svsqrshrun,_u8_x4,,,)(zn, 32); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svsqrshrun_u16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.sqrshrun.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], i32 64) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z22test_svsqrshrun_u16_x411svint64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.sqrshrun.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], i32 64) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint16_t test_svsqrshrun_u16_x4(svint64x4_t zn) { + return SVE_ACLE_FUNC(svsqrshrun,_u16_x4,,,)(zn, 64); +} diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_sclamp.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_sclamp.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_sclamp.c @@ -0,0 +1,344 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 \ +// RUN: -S -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 \ +// RUN: -S -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 \ +// RUN: -S -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 \ +// RUN: -S -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 \ +// RUN: -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. 
+#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED) A1##A3 +#else +#define SVE_ACLE_FUNC(A1, A2, A3, A4) A1##A2##A3##A4 +#endif + +// CHECK-LABEL: @test_svclamp_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.sclamp.nxv16i8( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z15test_svclamp_s8u10__SVInt8_tu10__SVInt8_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.sclamp.nxv16i8( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svint8_t test_svclamp_s8(svint8_t op1, svint8_t op2, svint8_t op3) { + return SVE_ACLE_FUNC(svclamp, _s8, , )(op1, op2, op3); +} + +// CHECK-LABEL: @test_svclamp_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.sclamp.nxv8i16( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z16test_svclamp_s16u11__SVInt16_tu11__SVInt16_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.sclamp.nxv8i16( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svint16_t test_svclamp_s16(svint16_t op1, svint16_t op2, svint16_t op3) { + return SVE_ACLE_FUNC(svclamp, _s16, , )(op1, op2, op3); +} + +// CHECK-LABEL: @test_svclamp_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.sclamp.nxv4i32( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z16test_svclamp_s32u11__SVInt32_tu11__SVInt32_tu11__SVInt32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.sclamp.nxv4i32( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svint32_t test_svclamp_s32(svint32_t op1, svint32_t op2, svint32_t op3) { + return SVE_ACLE_FUNC(svclamp, _s32, , )(op1, op2, op3); +} + +// CHECK-LABEL: @test_svclamp_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.sclamp.nxv2i64( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z16test_svclamp_s64u11__SVInt64_tu11__SVInt64_tu11__SVInt64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.sclamp.nxv2i64( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svint64_t test_svclamp_s64(svint64_t op1, svint64_t op2, svint64_t op3) { + return SVE_ACLE_FUNC(svclamp, _s64, , )(op1, op2, op3); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svclamp_single_s8_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[OP1:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[OP1]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.sclamp.single.x2.nxv16i8( [[TMP0]], [[TMP1]], [[OP2:%.*]], [[OP3:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP4]], [[TMP5]], i64 16) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z25test_svclamp_single_s8_x210svint8x2_tu10__SVInt8_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call 
@llvm.vector.extract.nxv16i8.nxv32i8( [[OP1:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[OP1]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.sclamp.single.x2.nxv16i8( [[TMP0]], [[TMP1]], [[OP2:%.*]], [[OP3:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP4]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svint8x2_t test_svclamp_single_s8_x2(svint8x2_t op1, svint8_t op2, svint8_t op3) { + return SVE_ACLE_FUNC(svclamp, _single_s8_x2, , )(op1, op2, op3); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svclamp_single_s16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[OP1:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[OP1]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.sclamp.single.x2.nxv8i16( [[TMP0]], [[TMP1]], [[OP2:%.*]], [[OP3:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z26test_svclamp_single_s16_x211svint16x2_tu11__SVInt16_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[OP1:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[OP1]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.sclamp.single.x2.nxv8i16( [[TMP0]], [[TMP1]], [[OP2:%.*]], [[OP3:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svint16x2_t test_svclamp_single_s16_x2(svint16x2_t op1, svint16_t op2, svint16_t op3) { + return SVE_ACLE_FUNC(svclamp, _single_s16_x2, , )(op1, op2, op3); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svclamp_single_s32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[OP1:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[OP1]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.sclamp.single.x2.nxv4i32( [[TMP0]], [[TMP1]], [[OP2:%.*]], [[OP3:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z26test_svclamp_single_s32_x211svint32x2_tu11__SVInt32_tu11__SVInt32_t( +// 
CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[OP1:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[OP1]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.sclamp.single.x2.nxv4i32( [[TMP0]], [[TMP1]], [[OP2:%.*]], [[OP3:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svint32x2_t test_svclamp_single_s32_x2(svint32x2_t op1, svint32_t op2, svint32_t op3) { + return SVE_ACLE_FUNC(svclamp, _single_s32_x2, , )(op1, op2, op3); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svclamp_single_s64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[OP1:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[OP1]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.sclamp.single.x2.nxv2i64( [[TMP0]], [[TMP1]], [[OP2:%.*]], [[OP3:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP4]], [[TMP5]], i64 2) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z26test_svclamp_single_s64_x211svint64x2_tu11__SVInt64_tu11__SVInt64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[OP1:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[OP1]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.sclamp.single.x2.nxv2i64( [[TMP0]], [[TMP1]], [[OP2:%.*]], [[OP3:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP4]], [[TMP5]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svint64x2_t test_svclamp_single_s64_x2(svint64x2_t op1, svint64_t op2, svint64_t op3) { + return SVE_ACLE_FUNC(svclamp, _single_s64_x2, , )(op1, op2, op3); +} + + + + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svclamp_single_s8_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[OP1:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[OP1]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[OP1]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[OP1]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.sclamp.single.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[OP2:%.*]], [[OP3:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP5]], i64 0) +// 
CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP8]], [[TMP9]], i64 32) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 48) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z25test_svclamp_single_s8_x410svint8x4_tu10__SVInt8_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[OP1:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[OP1]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[OP1]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[OP1]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.sclamp.single.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[OP2:%.*]], [[OP3:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP8]], [[TMP9]], i64 32) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svint8x4_t test_svclamp_single_s8_x4(svint8x4_t op1, svint8_t op2, svint8_t op3) { + return SVE_ACLE_FUNC(svclamp, _single_s8_x4, , )(op1, op2, op3); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svclamp_single_s16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[OP1:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[OP1]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[OP1]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[OP1]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.sclamp.single.x4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[OP2:%.*]], [[OP3:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 8) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP8]], [[TMP9]], i64 16) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 24) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: 
@_Z26test_svclamp_single_s16_x411svint16x4_tu11__SVInt16_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[OP1:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[OP1]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[OP1]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[OP1]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.sclamp.single.x4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[OP2:%.*]], [[OP3:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 8) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP8]], [[TMP9]], i64 16) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svint16x4_t test_svclamp_single_s16_x4(svint16x4_t op1, svint16_t op2, svint16_t op3) { + return SVE_ACLE_FUNC(svclamp, _single_s16_x4, , )(op1, op2, op3); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svclamp_single_s32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[OP1:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[OP1]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[OP1]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[OP1]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.sclamp.single.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[OP2:%.*]], [[OP3:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP8]], [[TMP9]], i64 8) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 12) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z26test_svclamp_single_s32_x411svint32x4_tu11__SVInt32_tu11__SVInt32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[OP1:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[OP1]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[OP1]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[OP1]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , 
} @llvm.aarch64.sve.sclamp.single.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[OP2:%.*]], [[OP3:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP8]], [[TMP9]], i64 8) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svint32x4_t test_svclamp_single_s32_x4(svint32x4_t op1, svint32_t op2, svint32_t op3) { + return SVE_ACLE_FUNC(svclamp, _single_s32_x4, , )(op1, op2, op3); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svclamp_single_s64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[OP1:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[OP1]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[OP1]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[OP1]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.sclamp.single.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[OP2:%.*]], [[OP3:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 2) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP8]], [[TMP9]], i64 4) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP10]], [[TMP11]], i64 6) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z26test_svclamp_single_s64_x411svint64x4_tu11__SVInt64_tu11__SVInt64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[OP1:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[OP1]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[OP1]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[OP1]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.sclamp.single.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[OP2:%.*]], [[OP3:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 2) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = 
tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP8]], [[TMP9]], i64 4) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP10]], [[TMP11]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svint64x4_t test_svclamp_single_s64_x4(svint64x4_t op1, svint64_t op2, svint64_t op3) { + return SVE_ACLE_FUNC(svclamp, _single_s64_x4, , )(op1, op2, op3); +} diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_selx2.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_selx2.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_selx2.c @@ -0,0 +1,396 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -target-feature -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. 
+#define SVE_ACLE_FUNC(A1,A2_UNUSED) A1 +#else +#define SVE_ACLE_FUNC(A1,A2) A1##A2 +#endif + +// 8-bit ZIPs + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svsel_s8_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.sel.x2.nxv16i8(target("aarch64.svcount") [[PN:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z16test_svsel_s8_x2u11__SVCount_t10svint8x2_t10svint8x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.sel.x2.nxv16i8(target("aarch64.svcount") [[PN:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint8x2_t test_svsel_s8_x2(svcount_t pn, svint8x2_t zn, svint8x2_t zm) { + return SVE_ACLE_FUNC(svsel,_s8_x2)(pn, zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svsel_u8_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.sel.x2.nxv16i8(target("aarch64.svcount") [[PN:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z16test_svsel_u8_x2u11__SVCount_t11svuint8x2_t11svuint8x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail 
call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.sel.x2.nxv16i8(target("aarch64.svcount") [[PN:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint8x2_t test_svsel_u8_x2(svcount_t pn, svuint8x2_t zn, svuint8x2_t zm) { + return SVE_ACLE_FUNC(svsel,_u8_x2)(pn, zn, zm); +} + +// 16-bit ZIPs + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svsel_s16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.sel.x2.nxv8i16(target("aarch64.svcount") [[PN:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP6]], [[TMP7]], i64 8) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svsel_s16_x2u11__SVCount_t11svint16x2_t11svint16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.sel.x2.nxv8i16(target("aarch64.svcount") [[PN:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP6]], [[TMP7]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint16x2_t test_svsel_s16_x2(svcount_t pn, svint16x2_t zn, svint16x2_t zm) { + return SVE_ACLE_FUNC(svsel,_s16_x2)(pn, zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svsel_u16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM:%.*]], i64 0) +// 
CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.sel.x2.nxv8i16(target("aarch64.svcount") [[PN:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP6]], [[TMP7]], i64 8) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svsel_u16_x2u11__SVCount_t12svuint16x2_t12svuint16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.sel.x2.nxv8i16(target("aarch64.svcount") [[PN:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP6]], [[TMP7]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint16x2_t test_svsel_u16_x2(svcount_t pn, svuint16x2_t zn, svuint16x2_t zm) { + return SVE_ACLE_FUNC(svsel,_u16_x2)(pn, zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svsel_f16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.sel.x2.nxv8f16(target("aarch64.svcount") [[PN:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP6]], [[TMP7]], i64 8) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svsel_f16_x2u11__SVCount_t13svfloat16x2_t13svfloat16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.sel.x2.nxv8f16(target("aarch64.svcount") [[PN:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = 
extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP6]], [[TMP7]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat16x2_t test_svsel_f16_x2(svcount_t pn, svfloat16x2_t zn, svfloat16x2_t zm) { + return SVE_ACLE_FUNC(svsel,_f16_x2)(pn, zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svsel_bf16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.sel.x2.nxv8bf16(target("aarch64.svcount") [[PN:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( [[TMP6]], [[TMP7]], i64 8) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z18test_svsel_bf16_x2u11__SVCount_t14svbfloat16x2_t14svbfloat16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.sel.x2.nxv8bf16(target("aarch64.svcount") [[PN:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( [[TMP6]], [[TMP7]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svbfloat16x2_t test_svsel_bf16_x2(svcount_t pn, svbfloat16x2_t zn, svbfloat16x2_t zm) { + return SVE_ACLE_FUNC(svsel,_bf16_x2)(pn, zn, zm); +} + +// 32-bit ZIPs + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svsel_s32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.sel.x2.nxv4i32(target("aarch64.svcount") [[PN:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP5]], 
i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svsel_s32_x2u11__SVCount_t11svint32x2_t11svint32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.sel.x2.nxv4i32(target("aarch64.svcount") [[PN:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint32x2_t test_svsel_s32_x2(svcount_t pn, svint32x2_t zn, svint32x2_t zm) { + return SVE_ACLE_FUNC(svsel,_s32_x2)(pn, zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svsel_u32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.sel.x2.nxv4i32(target("aarch64.svcount") [[PN:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svsel_u32_x2u11__SVCount_t12svuint32x2_t12svuint32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.sel.x2.nxv4i32(target("aarch64.svcount") [[PN:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint32x2_t test_svsel_u32_x2(svcount_t pn, svuint32x2_t zn, svuint32x2_t zm) { + return 
SVE_ACLE_FUNC(svsel,_u32_x2)(pn, zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svsel_f32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.sel.x2.nxv4f32(target("aarch64.svcount") [[PN:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP6]], [[TMP7]], i64 4) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svsel_f32_x2u11__SVCount_t13svfloat32x2_t13svfloat32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.sel.x2.nxv4f32(target("aarch64.svcount") [[PN:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP6]], [[TMP7]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat32x2_t test_svsel_f32_x2(svcount_t pn, svfloat32x2_t zn, svfloat32x2_t zm) { + return SVE_ACLE_FUNC(svsel,_f32_x2)(pn, zn, zm); +} + +// 64-bit ZIPs + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svsel_s64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM]], i64 2) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.sel.x2.nxv2i64(target("aarch64.svcount") [[PN:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP6]], [[TMP7]], i64 2) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svsel_s64_x2u11__SVCount_t11svint64x2_t11svint64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call 
@llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM]], i64 2) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.sel.x2.nxv2i64(target("aarch64.svcount") [[PN:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP6]], [[TMP7]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint64x2_t test_svsel_s64_x2(svcount_t pn, svint64x2_t zn, svint64x2_t zm) { + return SVE_ACLE_FUNC(svsel,_s64_x2)(pn, zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svsel_u64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM]], i64 2) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.sel.x2.nxv2i64(target("aarch64.svcount") [[PN:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP6]], [[TMP7]], i64 2) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svsel_u64_x2u11__SVCount_t12svuint64x2_t12svuint64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM]], i64 2) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.sel.x2.nxv2i64(target("aarch64.svcount") [[PN:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP6]], [[TMP7]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint64x2_t test_svsel_u64_x2(svcount_t pn, svuint64x2_t zn, svuint64x2_t zm) { + return SVE_ACLE_FUNC(svsel,_u64_x2)(pn, zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svsel_f64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail 
call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZM]], i64 2) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.sel.x2.nxv2f64(target("aarch64.svcount") [[PN:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP6]], [[TMP7]], i64 2) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svsel_f64_x2u11__SVCount_t13svfloat64x2_t13svfloat64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZM]], i64 2) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.sel.x2.nxv2f64(target("aarch64.svcount") [[PN:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP6]], [[TMP7]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat64x2_t test_svsel_f64_x2(svcount_t pn, svfloat64x2_t zn, svfloat64x2_t zm) { + return SVE_ACLE_FUNC(svsel,_f64_x2)(pn, zn, zm); +} diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_selx4.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_selx4.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_selx4.c @@ -0,0 +1,588 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -target-feature -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. 
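+// Illustration only (not part of the generated checks): as in the _selx2
+// test above, SVE_ACLE_FUNC(svsel,_u8_x4)(pn, zn1, zn2) resolves to the
+// overloaded call svsel(pn, zn1, zn2) when SVE_OVERLOADED_FORMS is defined,
+// and to svsel_u8_x4(pn, zn1, zn2) otherwise.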
+#define SVE_ACLE_FUNC(A1,A2_UNUSED) A1 +#else +#define SVE_ACLE_FUNC(A1,A2) A1##A2 +#endif + +// 8-bit ZIPs + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svsel_s8_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN1:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN1]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN1]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN1]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN2:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN2]], i64 16) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN2]], i64 32) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN2]], i64 48) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.sel.x4.nxv16i8(target("aarch64.svcount") [[PN:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 16) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP12]], [[TMP13]], i64 32) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP14]], [[TMP15]], i64 48) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z16test_svsel_s8_x4u11__SVCount_t10svint8x4_t10svint8x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN1:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN1]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN1]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN1]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN2:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN2]], i64 16) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN2]], i64 32) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN2]], i64 48) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.sel.x4.nxv16i8(target("aarch64.svcount") [[PN:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 16) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( 
[[TMP12]], [[TMP13]], i64 32) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP14]], [[TMP15]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svint8x4_t test_svsel_s8_x4(svcount_t pn, svint8x4_t zn1, svint8x4_t zn2) { + return SVE_ACLE_FUNC(svsel,_s8_x4)(pn, zn1, zn2); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svsel_u8_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN1:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN1]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN1]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN1]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN2:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN2]], i64 16) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN2]], i64 32) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN2]], i64 48) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.sel.x4.nxv16i8(target("aarch64.svcount") [[PN:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 16) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP12]], [[TMP13]], i64 32) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP14]], [[TMP15]], i64 48) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z16test_svsel_u8_x4u11__SVCount_t11svuint8x4_t11svuint8x4_t11svuint8x4_t11svuint8x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN1:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN1]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN1]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN1]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN2:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN2]], i64 16) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN2]], i64 32) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN2]], i64 48) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.sel.x4.nxv16i8(target("aarch64.svcount") [[PN:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = 
extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 16) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP12]], [[TMP13]], i64 32) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP14]], [[TMP15]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svuint8x4_t test_svsel_u8_x4(svcount_t pn, svuint8x4_t zn1, svuint8x4_t zn2, svuint8x4_t zn3, svuint8x4_t zn4) { + return SVE_ACLE_FUNC(svsel,_u8_x4)(pn, zn1, zn2); +} + +// 16-bit ZIPs + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svsel_s16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN1:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN1]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN1]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN1]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN2:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN2]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN2]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN2]], i64 24) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.sel.x4.nxv8i16(target("aarch64.svcount") [[PN:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 8) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP12]], [[TMP13]], i64 16) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP14]], [[TMP15]], i64 24) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z17test_svsel_s16_x4u11__SVCount_t11svint16x4_t11svint16x4_t11svint16x4_t11svint16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN1:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN1]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN1]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN1]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN2:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN2]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN2]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN2]], i64 24) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , 
} @llvm.aarch64.sve.sel.x4.nxv8i16(target("aarch64.svcount") [[PN:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 8) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP12]], [[TMP13]], i64 16) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP14]], [[TMP15]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svint16x4_t test_svsel_s16_x4(svcount_t pn, svint16x4_t zn1, svint16x4_t zn2, svint16x4_t zn3, svint16x4_t zn4) { + return SVE_ACLE_FUNC(svsel,_s16_x4)(pn, zn1, zn2); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svsel_u16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN1:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN1]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN1]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN1]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN2:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN2]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN2]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN2]], i64 24) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.sel.x4.nxv8i16(target("aarch64.svcount") [[PN:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 8) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP12]], [[TMP13]], i64 16) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP14]], [[TMP15]], i64 24) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z17test_svsel_u16_x4u11__SVCount_t12svuint16x4_t12svuint16x4_t12svuint16x4_t12svuint16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN1:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN1]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN1]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN1]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN2:%.*]], i64 
0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN2]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN2]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN2]], i64 24) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.sel.x4.nxv8i16(target("aarch64.svcount") [[PN:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 8) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP12]], [[TMP13]], i64 16) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP14]], [[TMP15]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svuint16x4_t test_svsel_u16_x4(svcount_t pn, svuint16x4_t zn1, svuint16x4_t zn2, svuint16x4_t zn3, svuint16x4_t zn4) { + return SVE_ACLE_FUNC(svsel,_u16_x4)(pn, zn1, zn2); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svsel_f16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN1:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN1]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN1]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN1]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN2:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN2]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN2]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN2]], i64 24) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.sel.x4.nxv8f16(target("aarch64.svcount") [[PN:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP10]], [[TMP11]], i64 8) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP12]], [[TMP13]], i64 16) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP14]], [[TMP15]], i64 24) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z17test_svsel_f16_x4u11__SVCount_t13svfloat16x4_t13svfloat16x4_t13svfloat16x4_t13svfloat16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN1:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = 
tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN1]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN1]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN1]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN2:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN2]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN2]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN2]], i64 24) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.sel.x4.nxv8f16(target("aarch64.svcount") [[PN:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP10]], [[TMP11]], i64 8) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP12]], [[TMP13]], i64 16) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP14]], [[TMP15]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svfloat16x4_t test_svsel_f16_x4(svcount_t pn, svfloat16x4_t zn1, svfloat16x4_t zn2, svfloat16x4_t zn3, svfloat16x4_t zn4) { + return SVE_ACLE_FUNC(svsel,_f16_x4)(pn, zn1, zn2); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svsel_bf16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN1:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN1]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN1]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN1]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN2:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN2]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN2]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN2]], i64 24) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.sel.x4.nxv8bf16(target("aarch64.svcount") [[PN:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP10]], [[TMP11]], i64 8) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP12]], [[TMP13]], i64 16) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = 
tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP14]], [[TMP15]], i64 24) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z18test_svsel_bf16_x4u11__SVCount_t14svbfloat16x4_t14svbfloat16x4_t14svbfloat16x4_t14svbfloat16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN1:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN1]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN1]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN1]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN2:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN2]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN2]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN2]], i64 24) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.sel.x4.nxv8bf16(target("aarch64.svcount") [[PN:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP10]], [[TMP11]], i64 8) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP12]], [[TMP13]], i64 16) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP14]], [[TMP15]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svbfloat16x4_t test_svsel_bf16_x4(svcount_t pn, svbfloat16x4_t zn1, svbfloat16x4_t zn2, svbfloat16x4_t zn3, svbfloat16x4_t zn4) { + return SVE_ACLE_FUNC(svsel,_bf16_x4)(pn, zn1, zn2); +} + +// 32-bit ZIPs + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svsel_s32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN1:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN1]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN1]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN1]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN2:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN2]], i64 4) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN2]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN2]], i64 12) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.sel.x4.nxv4i32(target("aarch64.svcount") [[PN:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP9]], i64 0) +// CHECK-NEXT: 
[[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 4) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP12]], [[TMP13]], i64 8) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP14]], [[TMP15]], i64 12) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z17test_svsel_s32_x4u11__SVCount_t11svint32x4_t11svint32x4_t11svint32x4_t11svint32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN1:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN1]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN1]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN1]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN2:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN2]], i64 4) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN2]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN2]], i64 12) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.sel.x4.nxv4i32(target("aarch64.svcount") [[PN:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 4) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP12]], [[TMP13]], i64 8) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP14]], [[TMP15]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svint32x4_t test_svsel_s32_x4(svcount_t pn, svint32x4_t zn1, svint32x4_t zn2, svint32x4_t zn3, svint32x4_t zn4) { + return SVE_ACLE_FUNC(svsel,_s32_x4)(pn, zn1, zn2); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svsel_u32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN1:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN1]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN1]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN1]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN2:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN2]], i64 4) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN2]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN2]], i64 12) +// CHECK-NEXT: [[TMP8:%.*]] = tail call 
{ , , , } @llvm.aarch64.sve.sel.x4.nxv4i32(target("aarch64.svcount") [[PN:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 4) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP12]], [[TMP13]], i64 8) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP14]], [[TMP15]], i64 12) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z17test_svsel_u32_x4u11__SVCount_t12svuint32x4_t12svuint32x4_t12svuint32x4_t12svuint32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN1:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN1]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN1]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN1]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN2:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN2]], i64 4) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN2]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN2]], i64 12) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.sel.x4.nxv4i32(target("aarch64.svcount") [[PN:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 4) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP12]], [[TMP13]], i64 8) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP14]], [[TMP15]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svuint32x4_t test_svsel_u32_x4(svcount_t pn, svuint32x4_t zn1, svuint32x4_t zn2, svuint32x4_t zn3, svuint32x4_t zn4) { + return SVE_ACLE_FUNC(svsel,_u32_x4)(pn, zn1, zn2); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svsel_f32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN1:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN1]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN1]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN1]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call 
@llvm.vector.extract.nxv4f32.nxv16f32( [[ZN2:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN2]], i64 4) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN2]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN2]], i64 12) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.sel.x4.nxv4f32(target("aarch64.svcount") [[PN:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP10]], [[TMP11]], i64 4) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP12]], [[TMP13]], i64 8) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP14]], [[TMP15]], i64 12) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z17test_svsel_f32_x4u11__SVCount_t13svfloat32x4_t13svfloat32x4_t13svfloat32x4_t13svfloat32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN1:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN1]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN1]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN1]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN2:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN2]], i64 4) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN2]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN2]], i64 12) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.sel.x4.nxv4f32(target("aarch64.svcount") [[PN:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP10]], [[TMP11]], i64 4) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP12]], [[TMP13]], i64 8) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP14]], [[TMP15]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svfloat32x4_t test_svsel_f32_x4(svcount_t pn, svfloat32x4_t zn1, svfloat32x4_t zn2, svfloat32x4_t zn3, svfloat32x4_t zn4) { + return SVE_ACLE_FUNC(svsel,_f32_x4)(pn, zn1, zn2); +} + +// 64-bit ZIPs + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svsel_s64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call 
@llvm.vector.extract.nxv2i64.nxv8i64( [[ZN1:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN1]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN1]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN1]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN2:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN2]], i64 2) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN2]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN2]], i64 6) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.sel.x4.nxv2i64(target("aarch64.svcount") [[PN:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP10]], [[TMP11]], i64 2) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP12]], [[TMP13]], i64 4) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP14]], [[TMP15]], i64 6) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z17test_svsel_s64_x4u11__SVCount_t11svint64x4_t11svint64x4_t11svint64x4_t11svint64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN1:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN1]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN1]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN1]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN2:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN2]], i64 2) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN2]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN2]], i64 6) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.sel.x4.nxv2i64(target("aarch64.svcount") [[PN:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP10]], [[TMP11]], i64 2) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP12]], [[TMP13]], i64 4) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP14]], [[TMP15]], i64 6) +// CPP-CHECK-NEXT: ret 
[[TMP16]] +// +svint64x4_t test_svsel_s64_x4(svcount_t pn, svint64x4_t zn1, svint64x4_t zn2, svint64x4_t zn3, svint64x4_t zn4) { + return SVE_ACLE_FUNC(svsel,_s64_x4)(pn, zn1, zn2); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svsel_u64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN1:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN1]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN1]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN1]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN2:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN2]], i64 2) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN2]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN2]], i64 6) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.sel.x4.nxv2i64(target("aarch64.svcount") [[PN:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP10]], [[TMP11]], i64 2) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP12]], [[TMP13]], i64 4) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP14]], [[TMP15]], i64 6) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z17test_svsel_u64_x4u11__SVCount_t12svuint64x4_t12svuint64x4_t12svuint64x4_t12svuint64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN1:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN1]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN1]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN1]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN2:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN2]], i64 2) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN2]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN2]], i64 6) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.sel.x4.nxv2i64(target("aarch64.svcount") [[PN:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP10]], [[TMP11]], i64 2) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } 
[[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP12]], [[TMP13]], i64 4) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP14]], [[TMP15]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svuint64x4_t test_svsel_u64_x4(svcount_t pn, svuint64x4_t zn1, svuint64x4_t zn2, svuint64x4_t zn3, svuint64x4_t zn4) { + return SVE_ACLE_FUNC(svsel,_u64_x4)(pn, zn1, zn2); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svsel_f64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN1:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN1]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN1]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN1]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN2:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN2]], i64 2) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN2]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN2]], i64 6) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.sel.x4.nxv2f64(target("aarch64.svcount") [[PN:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP10]], [[TMP11]], i64 2) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP12]], [[TMP13]], i64 4) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP14]], [[TMP15]], i64 6) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z17test_svsel_f64_x4u11__SVCount_t13svfloat64x4_t13svfloat64x4_t13svfloat64x4_t13svfloat64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN1:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN1]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN1]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN1]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN2:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN2]], i64 2) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN2]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN2]], i64 6) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.sel.x4.nxv2f64(target("aarch64.svcount") [[PN:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// 
CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> poison, <vscale x 2 x double> [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP10]], <vscale x 2 x double> [[TMP11]], i64 2) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP12]], <vscale x 2 x double> [[TMP13]], i64 4) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP14]], <vscale x 2 x double> [[TMP15]], i64 6) +// CPP-CHECK-NEXT: ret <vscale x 8 x double> [[TMP16]] +// +svfloat64x4_t test_svsel_f64_x4(svcount_t pn, svfloat64x4_t zn1, svfloat64x4_t zn2, svfloat64x4_t zn3, svfloat64x4_t zn4) { + return SVE_ACLE_FUNC(svsel,_f64_x4)(pn, zn1, zn2); +} diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_st1.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_st1.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_st1.c @@ -0,0 +1,1042 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +#include <arm_sve.h> + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 +#endif + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svst1_u8_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[V]], i64 16) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.vg2.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z16test_svst1_u8_x2u11__SVCount_tPh11svuint8x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[V]], i64 16) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.vg2.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_u8_x2(svcount_t pn, uint8_t *base, svuint8x2_t v) +{ + return SVE_ACLE_FUNC(svst1,_u8_x2,,)(pn, base, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svst1_u16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[V]], i64 8) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.vg2.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z17test_svst1_u16_x2u11__SVCount_tPt12svuint16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[V]], i64 8) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.vg2.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_u16_x2(svcount_t pn, uint16_t *base, svuint16x2_t v) +{ + return SVE_ACLE_FUNC(svst1,_u16_x2,,)(pn, base, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svst1_u32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[V]], i64 4) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.vg2.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z17test_svst1_u32_x2u11__SVCount_tPj12svuint32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[V]], i64 4) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.vg2.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_u32_x2(svcount_t pn, uint32_t *base, svuint32x2_t v) +{ + return SVE_ACLE_FUNC(svst1,_u32_x2,,)(pn, base, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svst1_u64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] =
tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[V]], i64 2) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.vg2.nxv2i64( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z17test_svst1_u64_x2u11__SVCount_tPm12svuint64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[V]], i64 2) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.vg2.nxv2i64( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_u64_x2(svcount_t pn, uint64_t *base, svuint64x2_t v) +{ + return SVE_ACLE_FUNC(svst1,_u64_x2,,)(pn, base, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svst1_u8_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V]], i64 48) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.vg4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z16test_svst1_u8_x4u11__SVCount_tPh11svuint8x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V]], i64 48) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.vg4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_u8_x4(svcount_t pn, uint8_t *base, svuint8x4_t v) +{ + return SVE_ACLE_FUNC(svst1,_u8_x4,,)(pn, base, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svst1_u16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V]], i64 24) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.vg4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z17test_svst1_u16_x4u11__SVCount_tPt12svuint16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V]], i64 24) +// CPP-CHECK-NEXT: tail call void 
@llvm.aarch64.sve.st1.pn.vg4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_u16_x4(svcount_t pn, uint16_t *base, svuint16x4_t v) +{ + return SVE_ACLE_FUNC(svst1,_u16_x4,,)(pn, base, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svst1_u32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V]], i64 12) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.vg4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z17test_svst1_u32_x4u11__SVCount_tPj12svuint32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V]], i64 12) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.vg4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_u32_x4(svcount_t pn, uint32_t *base, svuint32x4_t v) +{ + return SVE_ACLE_FUNC(svst1,_u32_x4,,)(pn, base, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svst1_u64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V]], i64 6) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.vg4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z17test_svst1_u64_x4u11__SVCount_tPm12svuint64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V]], i64 6) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.vg4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_u64_x4(svcount_t pn, uint64_t *base, svuint64x4_t v) +{ + return SVE_ACLE_FUNC(svst1,_u64_x4,,)(pn, base, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svst1_s8_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call 
@llvm.vector.extract.nxv16i8.nxv32i8( [[V]], i64 16) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.vg2.nxv16i8( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z16test_svst1_s8_x2u11__SVCount_tPa10svint8x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[V]], i64 16) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.vg2.nxv16i8( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_s8_x2(svcount_t pn, int8_t *base, svint8x2_t v) +{ + return SVE_ACLE_FUNC(svst1,_s8_x2,,)(pn, base, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svst1_s16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[V]], i64 8) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.vg2.nxv8i16( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z17test_svst1_s16_x2u11__SVCount_tPs11svint16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[V]], i64 8) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.vg2.nxv8i16( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_s16_x2(svcount_t pn, int16_t *base, svint16x2_t v) +{ + return SVE_ACLE_FUNC(svst1,_s16_x2,,)(pn, base, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svst1_s32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[V]], i64 4) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.vg2.nxv4i32( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z17test_svst1_s32_x2u11__SVCount_tPi11svint32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[V]], i64 4) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.vg2.nxv4i32( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_s32_x2(svcount_t pn, int32_t *base, svint32x2_t v) +{ + return SVE_ACLE_FUNC(svst1,_s32_x2,,)(pn, base, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svst1_s64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[V]], i64 2) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.vg2.nxv2i64( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z17test_svst1_s64_x2u11__SVCount_tPl11svint64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: 
[[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[V]], i64 2) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.vg2.nxv2i64( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_s64_x2(svcount_t pn, int64_t *base, svint64x2_t v) +{ + return SVE_ACLE_FUNC(svst1,_s64_x2,,)(pn, base, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svst1_s8_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V]], i64 48) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.vg4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z16test_svst1_s8_x4u11__SVCount_tPa10svint8x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V]], i64 48) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.vg4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_s8_x4(svcount_t pn, int8_t *base, svint8x4_t v) +{ + return SVE_ACLE_FUNC(svst1,_s8_x4,,)(pn, base, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svst1_s16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V]], i64 24) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.vg4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z17test_svst1_s16_x4u11__SVCount_tPs11svint16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V]], i64 24) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.vg4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_s16_x4(svcount_t pn, int16_t *base, svint16x4_t v) +{ + return SVE_ACLE_FUNC(svst1,_s16_x4,,)(pn, base, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svst1_s32_x4( +// 
CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V]], i64 12) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.vg4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z17test_svst1_s32_x4u11__SVCount_tPi11svint32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V]], i64 12) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.vg4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_s32_x4(svcount_t pn, int32_t *base, svint32x4_t v) +{ + return SVE_ACLE_FUNC(svst1,_s32_x4,,)(pn, base, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svst1_s64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V]], i64 6) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.vg4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z17test_svst1_s64_x4u11__SVCount_tPl11svint64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V]], i64 6) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.vg4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_s64_x4(svcount_t pn, int64_t *base, svint64x4_t v) +{ + return SVE_ACLE_FUNC(svst1,_s64_x4,,)(pn, base, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svst1_f16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[V]], i64 8) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.vg2.nxv8f16( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z17test_svst1_f16_x2u11__SVCount_tPDh13svfloat16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call 
@llvm.vector.extract.nxv8f16.nxv16f16( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[V]], i64 8) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.vg2.nxv8f16( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_f16_x2(svcount_t pn, float16_t *base, svfloat16x2_t v) +{ + return SVE_ACLE_FUNC(svst1,_f16_x2,,)(pn, base, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svst1_f32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[V]], i64 4) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.vg2.nxv4f32( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z17test_svst1_f32_x2u11__SVCount_tPf13svfloat32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[V]], i64 4) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.vg2.nxv4f32( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_f32_x2(svcount_t pn, float32_t *base, svfloat32x2_t v) +{ + return SVE_ACLE_FUNC(svst1,_f32_x2,,)(pn, base, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svst1_f64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[V]], i64 2) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.vg2.nxv2f64( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z17test_svst1_f64_x2u11__SVCount_tPd13svfloat64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[V]], i64 2) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.vg2.nxv2f64( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_f64_x2(svcount_t pn, float64_t *base, svfloat64x2_t v) +{ + return SVE_ACLE_FUNC(svst1,_f64_x2,,)(pn, base, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svst1_f16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[V]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[V]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[V]], i64 24) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.vg4.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z17test_svst1_f16_x4u11__SVCount_tPDh13svfloat16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call 
@llvm.vector.extract.nxv8f16.nxv32f16( [[V]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[V]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[V]], i64 24) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.vg4.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_f16_x4(svcount_t pn, float16_t *base, svfloat16x4_t v) +{ + return SVE_ACLE_FUNC(svst1,_f16_x4,,)(pn, base, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svst1_f32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[V]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[V]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[V]], i64 12) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.vg4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z17test_svst1_f32_x4u11__SVCount_tPf13svfloat32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[V]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[V]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[V]], i64 12) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.vg4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_f32_x4(svcount_t pn, float32_t *base, svfloat32x4_t v) +{ + return SVE_ACLE_FUNC(svst1,_f32_x4,,)(pn, base, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svst1_f64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[V]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[V]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[V]], i64 6) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.vg4.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z17test_svst1_f64_x4u11__SVCount_tPd13svfloat64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[V]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[V]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[V]], i64 6) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.vg4.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_f64_x4(svcount_t pn, float64_t *base, svfloat64x4_t v) +{ + return 
SVE_ACLE_FUNC(svst1,_f64_x4,,)(pn, base, v); +} + + +// == VNUM variants == + + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svst1_vnum_u8_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[V]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.vg2.nxv16i8( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_svst1_vnum_u8_x2u11__SVCount_tPhl11svuint8x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[V]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.vg2.nxv16i8( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_vnum_u8_x2(svcount_t pn, uint8_t *base, int64_t vnum, svuint8x2_t v) +{ + return SVE_ACLE_FUNC(svst1_vnum,_u8_x2,,)(pn, base, vnum, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svst1_vnum_u16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[V]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.vg2.nxv8i16( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svst1_vnum_u16_x2u11__SVCount_tPtl12svuint16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[V]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.vg2.nxv8i16( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_vnum_u16_x2(svcount_t pn, uint16_t *base, int64_t vnum, svuint16x2_t v) +{ + return SVE_ACLE_FUNC(svst1_vnum,_u16_x2,,)(pn, base, vnum, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svst1_vnum_u32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[V]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.vg2.nxv4i32( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svst1_vnum_u32_x2u11__SVCount_tPjl12svuint32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[V]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] 
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.vg2.nxv4i32( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_vnum_u32_x2(svcount_t pn, uint32_t *base, int64_t vnum, svuint32x2_t v) +{ + return SVE_ACLE_FUNC(svst1_vnum,_u32_x2,,)(pn, base, vnum, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svst1_vnum_u64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[V]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.vg2.nxv2i64( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svst1_vnum_u64_x2u11__SVCount_tPml12svuint64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[V]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.vg2.nxv2i64( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_vnum_u64_x2(svcount_t pn, uint64_t *base, int64_t vnum, svuint64x2_t v) +{ + return SVE_ACLE_FUNC(svst1_vnum,_u64_x2,,)(pn, base, vnum, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svst1_vnum_u8_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.vg4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_svst1_vnum_u8_x4u11__SVCount_tPhl11svuint8x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.vg4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_vnum_u8_x4(svcount_t pn, uint8_t *base, int64_t vnum, svuint8x4_t v) +{ + return SVE_ACLE_FUNC(svst1_vnum,_u8_x4,,)(pn, base, vnum, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svst1_vnum_u16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call 
@llvm.vector.extract.nxv8i16.nxv32i16( [[V]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.vg4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svst1_vnum_u16_x4u11__SVCount_tPtl12svuint16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.vg4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_vnum_u16_x4(svcount_t pn, uint16_t *base, int64_t vnum, svuint16x4_t v) +{ + return SVE_ACLE_FUNC(svst1_vnum,_u16_x4,,)(pn, base, vnum, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svst1_vnum_u32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.vg4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svst1_vnum_u32_x4u11__SVCount_tPjl12svuint32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.vg4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_vnum_u32_x4(svcount_t pn, uint32_t *base, int64_t vnum, svuint32x4_t v) +{ + return SVE_ACLE_FUNC(svst1_vnum,_u32_x4,,)(pn, base, vnum, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svst1_vnum_u64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V]], i64 4) +// 
CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.vg4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svst1_vnum_u64_x4u11__SVCount_tPml12svuint64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.vg4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_vnum_u64_x4(svcount_t pn, uint64_t *base, int64_t vnum, svuint64x4_t v) +{ + return SVE_ACLE_FUNC(svst1_vnum,_u64_x4,,)(pn, base, vnum, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svst1_vnum_s8_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[V]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.vg2.nxv16i8( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_svst1_vnum_s8_x2u11__SVCount_tPal10svint8x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[V]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.vg2.nxv16i8( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_vnum_s8_x2(svcount_t pn, int8_t *base, int64_t vnum, svint8x2_t v) +{ + return SVE_ACLE_FUNC(svst1_vnum,_s8_x2,,)(pn, base, vnum, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svst1_vnum_s16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[V]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.vg2.nxv8i16( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svst1_vnum_s16_x2u11__SVCount_tPsl11svint16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[V]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void 
@llvm.aarch64.sve.st1.pn.vg2.nxv8i16( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_vnum_s16_x2(svcount_t pn, int16_t *base, int64_t vnum, svint16x2_t v) +{ + return SVE_ACLE_FUNC(svst1_vnum,_s16_x2,,)(pn, base, vnum, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svst1_vnum_s32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[V]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.vg2.nxv4i32( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svst1_vnum_s32_x2u11__SVCount_tPil11svint32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[V]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.vg2.nxv4i32( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_vnum_s32_x2(svcount_t pn, int32_t *base, int64_t vnum, svint32x2_t v) +{ + return SVE_ACLE_FUNC(svst1_vnum,_s32_x2,,)(pn, base, vnum, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svst1_vnum_s64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[V]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.vg2.nxv2i64( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svst1_vnum_s64_x2u11__SVCount_tPll11svint64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[V]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.vg2.nxv2i64( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_vnum_s64_x2(svcount_t pn, int64_t *base, int64_t vnum, svint64x2_t v) +{ + return SVE_ACLE_FUNC(svst1_vnum,_s64_x2,,)(pn, base, vnum, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svst1_vnum_s8_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.vg4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr 
[[TMP4]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_svst1_vnum_s8_x4u11__SVCount_tPal10svint8x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.vg4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_vnum_s8_x4(svcount_t pn, int8_t *base, int64_t vnum, svint8x4_t v) +{ + return SVE_ACLE_FUNC(svst1_vnum,_s8_x4,,)(pn, base, vnum, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svst1_vnum_s16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.vg4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svst1_vnum_s16_x4u11__SVCount_tPsl11svint16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.vg4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_vnum_s16_x4(svcount_t pn, int16_t *base, int64_t vnum, svint16x4_t v) +{ + return SVE_ACLE_FUNC(svst1_vnum,_s16_x4,,)(pn, base, vnum, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svst1_vnum_s32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.vg4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svst1_vnum_s32_x4u11__SVCount_tPil11svint32x4_t( +// CPP-CHECK-NEXT: entry: +// 
CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.vg4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_vnum_s32_x4(svcount_t pn, int32_t *base, int64_t vnum, svint32x4_t v) +{ + return SVE_ACLE_FUNC(svst1_vnum,_s32_x4,,)(pn, base, vnum, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svst1_vnum_s64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.vg4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svst1_vnum_s64_x4u11__SVCount_tPll11svint64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.vg4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_vnum_s64_x4(svcount_t pn, int64_t *base, int64_t vnum, svint64x4_t v) +{ + return SVE_ACLE_FUNC(svst1_vnum,_s64_x4,,)(pn, base, vnum, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svst1_vnum_f16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[CONV:%.*]] = fptosi double [[VNUM:%.*]] to i64 +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[V]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[CONV]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.vg2.nxv8f16( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svst1_vnum_f16_x2u11__SVCount_tPDhd13svfloat16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[CONV:%.*]] = fptosi double [[VNUM:%.*]] to i64 +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[V]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr 
, ptr [[BASE:%.*]], i64 [[CONV]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.vg2.nxv8f16( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_vnum_f16_x2(svcount_t pn, float16_t *base, float64_t vnum, svfloat16x2_t v) +{ + return SVE_ACLE_FUNC(svst1_vnum,_f16_x2,,)(pn, base, vnum, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svst1_vnum_f32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[CONV:%.*]] = fptosi double [[VNUM:%.*]] to i64 +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[V]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[CONV]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.vg2.nxv4f32( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svst1_vnum_f32_x2u11__SVCount_tPfd13svfloat32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[CONV:%.*]] = fptosi double [[VNUM:%.*]] to i64 +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[V]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[CONV]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.vg2.nxv4f32( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_vnum_f32_x2(svcount_t pn, float32_t *base, float64_t vnum, svfloat32x2_t v) +{ + return SVE_ACLE_FUNC(svst1_vnum,_f32_x2,,)(pn, base, vnum, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svst1_vnum_f64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[CONV:%.*]] = fptosi double [[VNUM:%.*]] to i64 +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[V]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[CONV]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.vg2.nxv2f64( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svst1_vnum_f64_x2u11__SVCount_tPdd13svfloat64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[CONV:%.*]] = fptosi double [[VNUM:%.*]] to i64 +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[V]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[CONV]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.vg2.nxv2f64( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_vnum_f64_x2(svcount_t pn, float64_t *base, float64_t vnum, svfloat64x2_t v) +{ + return SVE_ACLE_FUNC(svst1_vnum,_f64_x2,,)(pn, base, vnum, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svst1_vnum_f16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[CONV:%.*]] = fptosi double [[VNUM:%.*]] to i64 +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[V]], i64 8) +// CHECK-NEXT: 
[[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[V]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[V]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[CONV]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.vg4.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svst1_vnum_f16_x4u11__SVCount_tPDhd13svfloat16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[CONV:%.*]] = fptosi double [[VNUM:%.*]] to i64 +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[V]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[V]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[V]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[CONV]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.vg4.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_vnum_f16_x4(svcount_t pn, float16_t *base, float64_t vnum, svfloat16x4_t v) +{ + return SVE_ACLE_FUNC(svst1_vnum,_f16_x4,,)(pn, base, vnum, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svst1_vnum_f32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[CONV:%.*]] = fptosi double [[VNUM:%.*]] to i64 +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[V]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[V]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[V]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[CONV]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.vg4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svst1_vnum_f32_x4u11__SVCount_tPfd13svfloat32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[CONV:%.*]] = fptosi double [[VNUM:%.*]] to i64 +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[V]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[V]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[V]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[CONV]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.vg4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_vnum_f32_x4(svcount_t pn, float32_t *base, float64_t vnum, svfloat32x4_t v) +{ + return SVE_ACLE_FUNC(svst1_vnum,_f32_x4,,)(pn, base, vnum, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svst1_vnum_f64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[CONV:%.*]] = fptosi double [[VNUM:%.*]] to i64 +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[V:%.*]], i64 
0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[V]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[V]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[V]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[CONV]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.vg4.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svst1_vnum_f64_x4u11__SVCount_tPdd13svfloat64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[CONV:%.*]] = fptosi double [[VNUM:%.*]] to i64 +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[V]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[V]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[V]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[CONV]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.vg4.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_vnum_f64_x4(svcount_t pn, float64_t *base, float64_t vnum, svfloat64x4_t v) +{ + return SVE_ACLE_FUNC(svst1_vnum,_f64_x4,,)(pn, base, vnum, v); +} diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_stnt1.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_stnt1.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_stnt1.c @@ -0,0 +1,1042 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. 
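+// For example, SVE_ACLE_FUNC(svstnt1,_u8_x2,,) expands to the overloaded
+// name "svstnt1" when SVE_OVERLOADED_FORMS is defined, and to the full
+// intrinsic name "svstnt1_u8_x2" otherwise; both forms should generate the
+// same IR checked below.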
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 +#endif + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svstnt1_u8_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[V]], i64 16) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.vg2.nxv16i8( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z18test_svstnt1_u8_x2u11__SVCount_tPh11svuint8x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[V]], i64 16) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.vg2.nxv16i8( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svstnt1_u8_x2(svcount_t pn, uint8_t *base, svuint8x2_t v) +{ + return SVE_ACLE_FUNC(svstnt1,_u8_x2,,)(pn, base, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svstnt1_u16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[V]], i64 8) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.vg2.nxv8i16( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z19test_svstnt1_u16_x2u11__SVCount_tPt12svuint16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[V]], i64 8) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.vg2.nxv8i16( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svstnt1_u16_x2(svcount_t pn, uint16_t *base, svuint16x2_t v) +{ + return SVE_ACLE_FUNC(svstnt1,_u16_x2,,)(pn, base, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svstnt1_u32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[V]], i64 4) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.vg2.nxv4i32( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z19test_svstnt1_u32_x2u11__SVCount_tPj12svuint32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[V]], i64 4) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.vg2.nxv4i32( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svstnt1_u32_x2(svcount_t pn, uint32_t *base, svuint32x2_t v) +{ + return SVE_ACLE_FUNC(svstnt1,_u32_x2,,)(pn, base, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svstnt1_u64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[V:%.*]], 
i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[V]], i64 2) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.vg2.nxv2i64( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z19test_svstnt1_u64_x2u11__SVCount_tPm12svuint64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[V]], i64 2) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.vg2.nxv2i64( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svstnt1_u64_x2(svcount_t pn, uint64_t *base, svuint64x2_t v) +{ + return SVE_ACLE_FUNC(svstnt1,_u64_x2,,)(pn, base, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svstnt1_u8_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V]], i64 48) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.vg4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z18test_svstnt1_u8_x4u11__SVCount_tPh11svuint8x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V]], i64 48) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.vg4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svstnt1_u8_x4(svcount_t pn, uint8_t *base, svuint8x4_t v) +{ + return SVE_ACLE_FUNC(svstnt1,_u8_x4,,)(pn, base, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svstnt1_u16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V]], i64 24) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.vg4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z19test_svstnt1_u16_x4u11__SVCount_tPt12svuint16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call 
@llvm.vector.extract.nxv8i16.nxv32i16( [[V]], i64 24) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.vg4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svstnt1_u16_x4(svcount_t pn, uint16_t *base, svuint16x4_t v) +{ + return SVE_ACLE_FUNC(svstnt1,_u16_x4,,)(pn, base, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svstnt1_u32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V]], i64 12) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.vg4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z19test_svstnt1_u32_x4u11__SVCount_tPj12svuint32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V]], i64 12) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.vg4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svstnt1_u32_x4(svcount_t pn, uint32_t *base, svuint32x4_t v) +{ + return SVE_ACLE_FUNC(svstnt1,_u32_x4,,)(pn, base, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svstnt1_u64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V]], i64 6) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.vg4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z19test_svstnt1_u64_x4u11__SVCount_tPm12svuint64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V]], i64 6) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.vg4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svstnt1_u64_x4(svcount_t pn, uint64_t *base, svuint64x4_t v) +{ + return SVE_ACLE_FUNC(svstnt1,_u64_x4,,)(pn, base, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svstnt1_s8_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call 
@llvm.vector.extract.nxv16i8.nxv32i8( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[V]], i64 16) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.vg2.nxv16i8( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z18test_svstnt1_s8_x2u11__SVCount_tPa10svint8x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[V]], i64 16) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.vg2.nxv16i8( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svstnt1_s8_x2(svcount_t pn, int8_t *base, svint8x2_t v) +{ + return SVE_ACLE_FUNC(svstnt1,_s8_x2,,)(pn, base, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svstnt1_s16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[V]], i64 8) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.vg2.nxv8i16( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z19test_svstnt1_s16_x2u11__SVCount_tPs11svint16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[V]], i64 8) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.vg2.nxv8i16( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svstnt1_s16_x2(svcount_t pn, int16_t *base, svint16x2_t v) +{ + return SVE_ACLE_FUNC(svstnt1,_s16_x2,,)(pn, base, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svstnt1_s32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[V]], i64 4) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.vg2.nxv4i32( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z19test_svstnt1_s32_x2u11__SVCount_tPi11svint32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[V]], i64 4) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.vg2.nxv4i32( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svstnt1_s32_x2(svcount_t pn, int32_t *base, svint32x2_t v) +{ + return SVE_ACLE_FUNC(svstnt1,_s32_x2,,)(pn, base, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svstnt1_s64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[V]], i64 2) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.vg2.nxv2i64( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret 
void +// +// CPP-CHECK-LABEL: @_Z19test_svstnt1_s64_x2u11__SVCount_tPl11svint64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[V]], i64 2) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.vg2.nxv2i64( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svstnt1_s64_x2(svcount_t pn, int64_t *base, svint64x2_t v) +{ + return SVE_ACLE_FUNC(svstnt1,_s64_x2,,)(pn, base, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svstnt1_s8_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V]], i64 48) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.vg4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z18test_svstnt1_s8_x4u11__SVCount_tPa10svint8x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V]], i64 48) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.vg4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svstnt1_s8_x4(svcount_t pn, int8_t *base, svint8x4_t v) +{ + return SVE_ACLE_FUNC(svstnt1,_s8_x4,,)(pn, base, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svstnt1_s16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V]], i64 24) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.vg4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z19test_svstnt1_s16_x4u11__SVCount_tPs11svint16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V]], i64 24) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.vg4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svstnt1_s16_x4(svcount_t pn, int16_t 
*base, svint16x4_t v) +{ + return SVE_ACLE_FUNC(svstnt1,_s16_x4,,)(pn, base, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svstnt1_s32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V]], i64 12) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.vg4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z19test_svstnt1_s32_x4u11__SVCount_tPi11svint32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V]], i64 12) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.vg4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svstnt1_s32_x4(svcount_t pn, int32_t *base, svint32x4_t v) +{ + return SVE_ACLE_FUNC(svstnt1,_s32_x4,,)(pn, base, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svstnt1_s64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V]], i64 6) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.vg4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z19test_svstnt1_s64_x4u11__SVCount_tPl11svint64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V]], i64 6) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.vg4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svstnt1_s64_x4(svcount_t pn, int64_t *base, svint64x4_t v) +{ + return SVE_ACLE_FUNC(svstnt1,_s64_x4,,)(pn, base, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svstnt1_f16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[V]], i64 8) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.vg2.nxv8f16( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void 
+// +// CPP-CHECK-LABEL: @_Z19test_svstnt1_f16_x2u11__SVCount_tPDh13svfloat16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[V]], i64 8) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.vg2.nxv8f16( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svstnt1_f16_x2(svcount_t pn, float16_t *base, svfloat16x2_t v) +{ + return SVE_ACLE_FUNC(svstnt1,_f16_x2,,)(pn, base, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svstnt1_f32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[V]], i64 4) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.vg2.nxv4f32( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z19test_svstnt1_f32_x2u11__SVCount_tPf13svfloat32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[V]], i64 4) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.vg2.nxv4f32( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svstnt1_f32_x2(svcount_t pn, float32_t *base, svfloat32x2_t v) +{ + return SVE_ACLE_FUNC(svstnt1,_f32_x2,,)(pn, base, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svstnt1_f64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[V]], i64 2) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.vg2.nxv2f64( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z19test_svstnt1_f64_x2u11__SVCount_tPd13svfloat64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[V]], i64 2) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.vg2.nxv2f64( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svstnt1_f64_x2(svcount_t pn, float64_t *base, svfloat64x2_t v) +{ + return SVE_ACLE_FUNC(svstnt1,_f64_x2,,)(pn, base, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svstnt1_f16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[V]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[V]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[V]], i64 24) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.vg4.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z19test_svstnt1_f16_x4u11__SVCount_tPDh13svfloat16x4_t( 
+// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[V]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[V]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[V]], i64 24) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.vg4.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svstnt1_f16_x4(svcount_t pn, float16_t *base, svfloat16x4_t v) +{ + return SVE_ACLE_FUNC(svstnt1,_f16_x4,,)(pn, base, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svstnt1_f32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[V]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[V]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[V]], i64 12) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.vg4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z19test_svstnt1_f32_x4u11__SVCount_tPf13svfloat32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[V]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[V]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[V]], i64 12) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.vg4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svstnt1_f32_x4(svcount_t pn, float32_t *base, svfloat32x4_t v) +{ + return SVE_ACLE_FUNC(svstnt1,_f32_x4,,)(pn, base, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svstnt1_f64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[V]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[V]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[V]], i64 6) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.vg4.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z19test_svstnt1_f64_x4u11__SVCount_tPd13svfloat64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[V]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[V]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[V]], i64 6) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.vg4.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], 
[[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svstnt1_f64_x4(svcount_t pn, float64_t *base, svfloat64x4_t v) +{ + return SVE_ACLE_FUNC(svstnt1,_f64_x4,,)(pn, base, v); +} + + +// == VNUM variants == + + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svstnt1_vnum_u8_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[V]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.vg2.nxv16i8( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_svstnt1_vnum_u8_x2u11__SVCount_tPhl11svuint8x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[V]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.vg2.nxv16i8( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) +// CPP-CHECK-NEXT: ret void +// +void test_svstnt1_vnum_u8_x2(svcount_t pn, uint8_t *base, int64_t vnum, svuint8x2_t v) +{ + return SVE_ACLE_FUNC(svstnt1_vnum,_u8_x2,,)(pn, base, vnum, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svstnt1_vnum_u16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[V]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.vg2.nxv8i16( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svstnt1_vnum_u16_x2u11__SVCount_tPtl12svuint16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[V]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.vg2.nxv8i16( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) +// CPP-CHECK-NEXT: ret void +// +void test_svstnt1_vnum_u16_x2(svcount_t pn, uint16_t *base, int64_t vnum, svuint16x2_t v) +{ + return SVE_ACLE_FUNC(svstnt1_vnum,_u16_x2,,)(pn, base, vnum, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svstnt1_vnum_u32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[V]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.vg2.nxv4i32( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svstnt1_vnum_u32_x2u11__SVCount_tPjl12svuint32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call 
@llvm.vector.extract.nxv4i32.nxv8i32( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[V]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.vg2.nxv4i32( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) +// CPP-CHECK-NEXT: ret void +// +void test_svstnt1_vnum_u32_x2(svcount_t pn, uint32_t *base, int64_t vnum, svuint32x2_t v) +{ + return SVE_ACLE_FUNC(svstnt1_vnum,_u32_x2,,)(pn, base, vnum, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svstnt1_vnum_u64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[V]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.vg2.nxv2i64( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svstnt1_vnum_u64_x2u11__SVCount_tPml12svuint64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[V]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.vg2.nxv2i64( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) +// CPP-CHECK-NEXT: ret void +// +void test_svstnt1_vnum_u64_x2(svcount_t pn, uint64_t *base, int64_t vnum, svuint64x2_t v) +{ + return SVE_ACLE_FUNC(svstnt1_vnum,_u64_x2,,)(pn, base, vnum, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svstnt1_vnum_u8_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.vg4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_svstnt1_vnum_u8_x4u11__SVCount_tPhl11svuint8x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.vg4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) +// CPP-CHECK-NEXT: ret void +// +void test_svstnt1_vnum_u8_x4(svcount_t pn, uint8_t *base, int64_t vnum, svuint8x4_t v) +{ + return SVE_ACLE_FUNC(svstnt1_vnum,_u8_x4,,)(pn, base, 
vnum, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svstnt1_vnum_u16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.vg4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svstnt1_vnum_u16_x4u11__SVCount_tPtl12svuint16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.vg4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) +// CPP-CHECK-NEXT: ret void +// +void test_svstnt1_vnum_u16_x4(svcount_t pn, uint16_t *base, int64_t vnum, svuint16x4_t v) +{ + return SVE_ACLE_FUNC(svstnt1_vnum,_u16_x4,,)(pn, base, vnum, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svstnt1_vnum_u32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.vg4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svstnt1_vnum_u32_x4u11__SVCount_tPjl12svuint32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.vg4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) +// CPP-CHECK-NEXT: ret void +// +void test_svstnt1_vnum_u32_x4(svcount_t pn, uint32_t *base, int64_t vnum, svuint32x4_t v) +{ + return SVE_ACLE_FUNC(svstnt1_vnum,_u32_x4,,)(pn, base, vnum, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svstnt1_vnum_u64_x4( +// CHECK-NEXT: entry: +// 
CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.vg4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svstnt1_vnum_u64_x4u11__SVCount_tPml12svuint64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.vg4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) +// CPP-CHECK-NEXT: ret void +// +void test_svstnt1_vnum_u64_x4(svcount_t pn, uint64_t *base, int64_t vnum, svuint64x4_t v) +{ + return SVE_ACLE_FUNC(svstnt1_vnum,_u64_x4,,)(pn, base, vnum, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svstnt1_vnum_s8_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[V]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.vg2.nxv16i8( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_svstnt1_vnum_s8_x2u11__SVCount_tPal10svint8x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[V]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.vg2.nxv16i8( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) +// CPP-CHECK-NEXT: ret void +// +void test_svstnt1_vnum_s8_x2(svcount_t pn, int8_t *base, int64_t vnum, svint8x2_t v) +{ + return SVE_ACLE_FUNC(svstnt1_vnum,_s8_x2,,)(pn, base, vnum, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svstnt1_vnum_s16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[V]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.vg2.nxv8i16( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svstnt1_vnum_s16_x2u11__SVCount_tPsl11svint16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: 
[[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[V]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.vg2.nxv8i16( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) +// CPP-CHECK-NEXT: ret void +// +void test_svstnt1_vnum_s16_x2(svcount_t pn, int16_t *base, int64_t vnum, svint16x2_t v) +{ + return SVE_ACLE_FUNC(svstnt1_vnum,_s16_x2,,)(pn, base, vnum, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svstnt1_vnum_s32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[V]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.vg2.nxv4i32( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svstnt1_vnum_s32_x2u11__SVCount_tPil11svint32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[V]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.vg2.nxv4i32( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) +// CPP-CHECK-NEXT: ret void +// +void test_svstnt1_vnum_s32_x2(svcount_t pn, int32_t *base, int64_t vnum, svint32x2_t v) +{ + return SVE_ACLE_FUNC(svstnt1_vnum,_s32_x2,,)(pn, base, vnum, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svstnt1_vnum_s64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[V]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.vg2.nxv2i64( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svstnt1_vnum_s64_x2u11__SVCount_tPll11svint64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[V]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.vg2.nxv2i64( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) +// CPP-CHECK-NEXT: ret void +// +void test_svstnt1_vnum_s64_x2(svcount_t pn, int64_t *base, int64_t vnum, svint64x2_t v) +{ + return SVE_ACLE_FUNC(svstnt1_vnum,_s64_x2,,)(pn, base, vnum, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svstnt1_vnum_s8_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V]], i64 32) 
+// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.vg4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_svstnt1_vnum_s8_x4u11__SVCount_tPal10svint8x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.vg4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) +// CPP-CHECK-NEXT: ret void +// +void test_svstnt1_vnum_s8_x4(svcount_t pn, int8_t *base, int64_t vnum, svint8x4_t v) +{ + return SVE_ACLE_FUNC(svstnt1_vnum,_s8_x4,,)(pn, base, vnum, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svstnt1_vnum_s16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.vg4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svstnt1_vnum_s16_x4u11__SVCount_tPsl11svint16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.vg4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) +// CPP-CHECK-NEXT: ret void +// +void test_svstnt1_vnum_s16_x4(svcount_t pn, int16_t *base, int64_t vnum, svint16x4_t v) +{ + return SVE_ACLE_FUNC(svstnt1_vnum,_s16_x4,,)(pn, base, vnum, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svstnt1_vnum_s32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , 
ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.vg4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svstnt1_vnum_s32_x4u11__SVCount_tPil11svint32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.vg4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) +// CPP-CHECK-NEXT: ret void +// +void test_svstnt1_vnum_s32_x4(svcount_t pn, int32_t *base, int64_t vnum, svint32x4_t v) +{ + return SVE_ACLE_FUNC(svstnt1_vnum,_s32_x4,,)(pn, base, vnum, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svstnt1_vnum_s64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.vg4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svstnt1_vnum_s64_x4u11__SVCount_tPll11svint64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.vg4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) +// CPP-CHECK-NEXT: ret void +// +void test_svstnt1_vnum_s64_x4(svcount_t pn, int64_t *base, int64_t vnum, svint64x4_t v) +{ + return SVE_ACLE_FUNC(svstnt1_vnum,_s64_x4,,)(pn, base, vnum, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svstnt1_vnum_f16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[CONV:%.*]] = fptosi double [[VNUM:%.*]] to i64 +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[V]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[CONV]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.vg2.nxv8f16( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: 
@_Z24test_svstnt1_vnum_f16_x2u11__SVCount_tPDhd13svfloat16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[CONV:%.*]] = fptosi double [[VNUM:%.*]] to i64 +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[V]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[CONV]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.vg2.nxv8f16( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) +// CPP-CHECK-NEXT: ret void +// +void test_svstnt1_vnum_f16_x2(svcount_t pn, float16_t *base, float64_t vnum, svfloat16x2_t v) +{ + return SVE_ACLE_FUNC(svstnt1_vnum,_f16_x2,,)(pn, base, vnum, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svstnt1_vnum_f32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[CONV:%.*]] = fptosi double [[VNUM:%.*]] to i64 +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[V]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[CONV]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.vg2.nxv4f32( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svstnt1_vnum_f32_x2u11__SVCount_tPfd13svfloat32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[CONV:%.*]] = fptosi double [[VNUM:%.*]] to i64 +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[V]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[CONV]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.vg2.nxv4f32( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) +// CPP-CHECK-NEXT: ret void +// +void test_svstnt1_vnum_f32_x2(svcount_t pn, float32_t *base, float64_t vnum, svfloat32x2_t v) +{ + return SVE_ACLE_FUNC(svstnt1_vnum,_f32_x2,,)(pn, base, vnum, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svstnt1_vnum_f64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[CONV:%.*]] = fptosi double [[VNUM:%.*]] to i64 +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[V]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[CONV]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.vg2.nxv2f64( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svstnt1_vnum_f64_x2u11__SVCount_tPdd13svfloat64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[CONV:%.*]] = fptosi double [[VNUM:%.*]] to i64 +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[V]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[CONV]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.vg2.nxv2f64( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) +// CPP-CHECK-NEXT: ret void +// +void test_svstnt1_vnum_f64_x2(svcount_t pn, float64_t *base, float64_t vnum, svfloat64x2_t v) +{ + 
return SVE_ACLE_FUNC(svstnt1_vnum,_f64_x2,,)(pn, base, vnum, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svstnt1_vnum_f16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[CONV:%.*]] = fptosi double [[VNUM:%.*]] to i64 +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[V]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[V]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[V]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[CONV]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.vg4.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svstnt1_vnum_f16_x4u11__SVCount_tPDhd13svfloat16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[CONV:%.*]] = fptosi double [[VNUM:%.*]] to i64 +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[V]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[V]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[V]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[CONV]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.vg4.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) +// CPP-CHECK-NEXT: ret void +// +void test_svstnt1_vnum_f16_x4(svcount_t pn, float16_t *base, float64_t vnum, svfloat16x4_t v) +{ + return SVE_ACLE_FUNC(svstnt1_vnum,_f16_x4,,)(pn, base, vnum, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svstnt1_vnum_f32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[CONV:%.*]] = fptosi double [[VNUM:%.*]] to i64 +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[V]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[V]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[V]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[CONV]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.vg4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svstnt1_vnum_f32_x4u11__SVCount_tPfd13svfloat32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[CONV:%.*]] = fptosi double [[VNUM:%.*]] to i64 +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[V]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[V]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[V]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[CONV]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.vg4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr 
[[TMP4]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svstnt1_vnum_f32_x4(svcount_t pn, float32_t *base, float64_t vnum, svfloat32x4_t v)
+{
+  return SVE_ACLE_FUNC(svstnt1_vnum,_f32_x4,,)(pn, base, vnum, v);
+}
+
+__attribute__((arm_streaming))
+// CHECK-LABEL: @test_svstnt1_vnum_f64_x4(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[CONV:%.*]] = fptosi double [[VNUM:%.*]] to i64
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[V:%.*]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[V]], i64 2)
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[V]], i64 4)
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[V]], i64 6)
+// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr <vscale x 2 x double>, ptr [[BASE:%.*]], i64 [[CONV]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sve.stnt1.pn.vg4.nxv2f64(<vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z24test_svstnt1_vnum_f64_x4u11__SVCount_tPdd13svfloat64x4_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[CONV:%.*]] = fptosi double [[VNUM:%.*]] to i64
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[V:%.*]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[V]], i64 2)
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[V]], i64 4)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[V]], i64 6)
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = getelementptr <vscale x 2 x double>, ptr [[BASE:%.*]], i64 [[CONV]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.stnt1.pn.vg4.nxv2f64(<vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svstnt1_vnum_f64_x4(svcount_t pn, float64_t *base, float64_t vnum, svfloat64x4_t v)
+{
+  return SVE_ACLE_FUNC(svstnt1_vnum,_f64_x4,,)(pn, base, vnum, v);
+}
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_uclamp.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_uclamp.c
new file mode 100644
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_uclamp.c
@@ -0,0 +1,344 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: aarch64-registered-target
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 \
+// RUN:   -S -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 \
+// RUN:   -S -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 \
+// RUN:   -S -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 \
+// RUN:   -S -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 \
+// RUN:   -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused...
macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED) A1##A3 +#else +#define SVE_ACLE_FUNC(A1, A2, A3, A4) A1##A2##A3##A4 +#endif + +// CHECK-LABEL: @test_svclamp_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.uclamp.nxv16i8( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z15test_svclamp_u8u11__SVUint8_tu11__SVUint8_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.uclamp.nxv16i8( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svuint8_t test_svclamp_u8(svuint8_t op1, svuint8_t op2, svuint8_t op3) { + return SVE_ACLE_FUNC(svclamp, _u8, , )(op1, op2, op3); +} + +// CHECK-LABEL: @test_svclamp_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.uclamp.nxv8i16( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z16test_svclamp_u16u12__SVUint16_tu12__SVUint16_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.uclamp.nxv8i16( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svuint16_t test_svclamp_u16(svuint16_t op1, svuint16_t op2, svuint16_t op3) { + return SVE_ACLE_FUNC(svclamp, _u16, , )(op1, op2, op3); +} + +// CHECK-LABEL: @test_svclamp_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.uclamp.nxv4i32( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z16test_svclamp_u32u12__SVUint32_tu12__SVUint32_tu12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.uclamp.nxv4i32( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svuint32_t test_svclamp_u32(svuint32_t op1, svuint32_t op2, svuint32_t op3) { + return SVE_ACLE_FUNC(svclamp, _u32, , )(op1, op2, op3); +} + +// CHECK-LABEL: @test_svclamp_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.uclamp.nxv2i64( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z16test_svclamp_u64u12__SVUint64_tu12__SVUint64_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.uclamp.nxv2i64( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svuint64_t test_svclamp_u64(svuint64_t op1, svuint64_t op2, svuint64_t op3) { + return SVE_ACLE_FUNC(svclamp, _u64, , )(op1, op2, op3); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svclamp_single_u8_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[OP1:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[OP1]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.uclamp.single.x2.nxv16i8( [[TMP0]], [[TMP1]], [[OP2:%.*]], [[OP3:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP4]], [[TMP5]], i64 16) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z25test_svclamp_single_u8_x211svuint8x2_tu11__SVUint8_tu11__SVUint8_t( +// 
CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[OP1:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[OP1]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.uclamp.single.x2.nxv16i8( [[TMP0]], [[TMP1]], [[OP2:%.*]], [[OP3:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP4]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svuint8x2_t test_svclamp_single_u8_x2(svuint8x2_t op1, svuint8_t op2, svuint8_t op3) { + return SVE_ACLE_FUNC(svclamp, _single_u8_x2, , )(op1, op2, op3); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svclamp_single_u16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[OP1:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[OP1]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.uclamp.single.x2.nxv8i16( [[TMP0]], [[TMP1]], [[OP2:%.*]], [[OP3:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z26test_svclamp_single_u16_x212svuint16x2_tu12__SVUint16_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[OP1:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[OP1]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.uclamp.single.x2.nxv8i16( [[TMP0]], [[TMP1]], [[OP2:%.*]], [[OP3:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svuint16x2_t test_svclamp_single_u16_x2(svuint16x2_t op1, svuint16_t op2, svuint16_t op3) { + return SVE_ACLE_FUNC(svclamp, _single_u16_x2, , )(op1, op2, op3); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svclamp_single_u32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[OP1:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[OP1]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.uclamp.single.x2.nxv4i32( [[TMP0]], [[TMP1]], [[OP2:%.*]], [[OP3:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: 
@_Z26test_svclamp_single_u32_x212svuint32x2_tu12__SVUint32_tu12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[OP1:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[OP1]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.uclamp.single.x2.nxv4i32( [[TMP0]], [[TMP1]], [[OP2:%.*]], [[OP3:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svuint32x2_t test_svclamp_single_u32_x2(svuint32x2_t op1, svuint32_t op2, svuint32_t op3) { + return SVE_ACLE_FUNC(svclamp, _single_u32_x2, , )(op1, op2, op3); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svclamp_single_u64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[OP1:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[OP1]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.uclamp.single.x2.nxv2i64( [[TMP0]], [[TMP1]], [[OP2:%.*]], [[OP3:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP4]], [[TMP5]], i64 2) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z26test_svclamp_single_u64_x212svuint64x2_tu12__SVUint64_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[OP1:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[OP1]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.uclamp.single.x2.nxv2i64( [[TMP0]], [[TMP1]], [[OP2:%.*]], [[OP3:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP4]], [[TMP5]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svuint64x2_t test_svclamp_single_u64_x2(svuint64x2_t op1, svuint64_t op2, svuint64_t op3) { + return SVE_ACLE_FUNC(svclamp, _single_u64_x2, , )(op1, op2, op3); +} + + + + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svclamp_single_u8_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[OP1:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[OP1]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[OP1]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[OP1]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.uclamp.single.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[OP2:%.*]], [[OP3:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: 
[[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP8]], [[TMP9]], i64 32) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 48) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z25test_svclamp_single_u8_x411svuint8x4_tu11__SVUint8_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[OP1:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[OP1]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[OP1]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[OP1]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.uclamp.single.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[OP2:%.*]], [[OP3:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP8]], [[TMP9]], i64 32) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svuint8x4_t test_svclamp_single_u8_x4(svuint8x4_t op1, svuint8_t op2, svuint8_t op3) { + return SVE_ACLE_FUNC(svclamp, _single_u8_x4, , )(op1, op2, op3); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svclamp_single_u16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[OP1:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[OP1]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[OP1]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[OP1]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.uclamp.single.x4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[OP2:%.*]], [[OP3:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 8) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP8]], [[TMP9]], i64 16) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call 
@llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 24) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z26test_svclamp_single_u16_x412svuint16x4_tu12__SVUint16_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[OP1:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[OP1]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[OP1]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[OP1]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.uclamp.single.x4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[OP2:%.*]], [[OP3:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 8) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP8]], [[TMP9]], i64 16) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svuint16x4_t test_svclamp_single_u16_x4(svuint16x4_t op1, svuint16_t op2, svuint16_t op3) { + return SVE_ACLE_FUNC(svclamp, _single_u16_x4, , )(op1, op2, op3); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svclamp_single_u32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[OP1:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[OP1]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[OP1]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[OP1]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.uclamp.single.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[OP2:%.*]], [[OP3:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP8]], [[TMP9]], i64 8) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 12) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z26test_svclamp_single_u32_x412svuint32x4_tu12__SVUint32_tu12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[OP1:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[OP1]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[OP1]], i64 8) +// CPP-CHECK-NEXT: 
[[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[OP1]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.uclamp.single.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[OP2:%.*]], [[OP3:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP8]], [[TMP9]], i64 8) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svuint32x4_t test_svclamp_single_u32_x4(svuint32x4_t op1, svuint32_t op2, svuint32_t op3) { + return SVE_ACLE_FUNC(svclamp, _single_u32_x4, , )(op1, op2, op3); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svclamp_single_u64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[OP1:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[OP1]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[OP1]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[OP1]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.uclamp.single.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[OP2:%.*]], [[OP3:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 2) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP8]], [[TMP9]], i64 4) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP10]], [[TMP11]], i64 6) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z26test_svclamp_single_u64_x412svuint64x4_tu12__SVUint64_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[OP1:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[OP1]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[OP1]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[OP1]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.uclamp.single.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[OP2:%.*]], [[OP3:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call 
@llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]], i64 2)
+// CPP-CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP4]], 2
+// CPP-CHECK-NEXT:    [[TMP10:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP8]], <vscale x 2 x i64> [[TMP9]], i64 4)
+// CPP-CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP4]], 3
+// CPP-CHECK-NEXT:    [[TMP12:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP10]], <vscale x 2 x i64> [[TMP11]], i64 6)
+// CPP-CHECK-NEXT:    ret <vscale x 8 x i64> [[TMP12]]
+//
+svuint64x4_t test_svclamp_single_u64_x4(svuint64x4_t op1, svuint64_t op2, svuint64_t op3) {
+  return SVE_ACLE_FUNC(svclamp, _single_u64_x4, , )(op1, op2, op3);
+}
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_unpkx2.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_unpkx2.c
new file mode 100644
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_unpkx2.c
@@ -0,0 +1,155 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: aarch64-registered-target
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED) A1 +#else +#define SVE_ACLE_FUNC(A1,A2) A1##A2 +#endif + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svunpk_s16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.sunpk.x2.nxv8i16( [[ZN:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z18test_svunpk_s16_x2u10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.sunpk.x2.nxv8i16( [[ZN:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svint16x2_t test_svunpk_s16_x2(svint8_t zn) { + return SVE_ACLE_FUNC(svunpk,_s16_x2)(zn); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svunpk_u16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uunpk.x2.nxv8i16( [[ZN:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z18test_svunpk_u16_x2u11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uunpk.x2.nxv8i16( [[ZN:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint16x2_t test_svunpk_u16_x2(svuint8_t zn) { + return SVE_ACLE_FUNC(svunpk,_u16_x2)(zn); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svunpk_s32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.sunpk.x2.nxv4i32( [[ZN:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z18test_svunpk_s32_x2u11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.sunpk.x2.nxv4i32( [[ZN:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] 
= tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svint32x2_t test_svunpk_s32_x2(svint16_t zn) { + return SVE_ACLE_FUNC(svunpk,_s32_x2)(zn); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svunpk_u32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uunpk.x2.nxv4i32( [[ZN:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z18test_svunpk_u32_x2u12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uunpk.x2.nxv4i32( [[ZN:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint32x2_t test_svunpk_u32_x2(svuint16_t zn) { + return SVE_ACLE_FUNC(svunpk,_u32_x2)(zn); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svunpk_s64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.sunpk.x2.nxv2i64( [[ZN:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z18test_svunpk_s64_x2u11__SVInt32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.sunpk.x2.nxv2i64( [[ZN:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svint64x2_t test_svunpk_s64_x2(svint32_t zn) { + return SVE_ACLE_FUNC(svunpk,_s64_x2)(zn); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svunpk_u64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uunpk.x2.nxv2i64( [[ZN:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z18test_svunpk_u64_x2u12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uunpk.x2.nxv2i64( [[ZN:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, 
[[TMP1]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 1
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], i64 2)
+// CPP-CHECK-NEXT:    ret <vscale x 4 x i64> [[TMP4]]
+//
+svuint64x2_t test_svunpk_u64_x2(svuint32_t zn) {
+  return SVE_ACLE_FUNC(svunpk,_u64_x2)(zn);
+}
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_unpkx4.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_unpkx4.c
new file mode 100644
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_unpkx4.c
@@ -0,0 +1,227 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: aarch64-registered-target
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED) A1 +#else +#define SVE_ACLE_FUNC(A1,A2) A1##A2 +#endif + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svunpk_s16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , , , } @llvm.aarch64.sve.sunpk.x4.nxv8i16( [[TMP0]], [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP2]], 2 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 16) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP2]], 3 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP8]], [[TMP9]], i64 24) +// CHECK-NEXT: ret [[TMP10]] +// +// CPP-CHECK-LABEL: @_Z18test_svunpk_s16_x410svint8x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , , , } @llvm.aarch64.sve.sunpk.x4.nxv8i16( [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP2]], 2 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 16) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP2]], 3 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP8]], [[TMP9]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP10]] +// +svint16x4_t test_svunpk_s16_x4(svint8x2_t zn) { + return SVE_ACLE_FUNC(svunpk,_s16_x4)(zn); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svunpk_u16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , , , } @llvm.aarch64.sve.uunpk.x4.nxv8i16( [[TMP0]], [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP2]], 2 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 16) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP2]], 3 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP8]], [[TMP9]], i64 24) +// 
CHECK-NEXT: ret [[TMP10]] +// +// CPP-CHECK-LABEL: @_Z18test_svunpk_u16_x411svuint8x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , , , } @llvm.aarch64.sve.uunpk.x4.nxv8i16( [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP2]], 2 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 16) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP2]], 3 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP8]], [[TMP9]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP10]] +// +svuint16x4_t test_svunpk_u16_x4(svuint8x2_t zn) { + return SVE_ACLE_FUNC(svunpk,_u16_x4)(zn); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svunpk_s32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , , , } @llvm.aarch64.sve.sunpk.x4.nxv4i32( [[TMP0]], [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP2]], 2 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 8) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP2]], 3 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP8]], [[TMP9]], i64 12) +// CHECK-NEXT: ret [[TMP10]] +// +// CPP-CHECK-LABEL: @_Z18test_svunpk_s32_x411svint16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , , , } @llvm.aarch64.sve.sunpk.x4.nxv4i32( [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP2]], 2 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 8) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP2]], 3 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP8]], [[TMP9]], i64 12) +// CPP-CHECK-NEXT: ret 
[[TMP10]] +// +svint32x4_t test_svunpk_s32_x4(svint16x2_t zn) { + return SVE_ACLE_FUNC(svunpk,_s32_x4)(zn); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svunpk_u32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , , , } @llvm.aarch64.sve.uunpk.x4.nxv4i32( [[TMP0]], [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP2]], 2 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 8) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP2]], 3 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP8]], [[TMP9]], i64 12) +// CHECK-NEXT: ret [[TMP10]] +// +// CPP-CHECK-LABEL: @_Z18test_svunpk_u32_x412svuint16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , , , } @llvm.aarch64.sve.uunpk.x4.nxv4i32( [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP2]], 2 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 8) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP2]], 3 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP8]], [[TMP9]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP10]] +// +svuint32x4_t test_svunpk_u32_x4(svuint16x2_t zn) { + return SVE_ACLE_FUNC(svunpk,_u32_x4)(zn); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svunpk_s64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , , , } @llvm.aarch64.sve.sunpk.x4.nxv2i64( [[TMP0]], [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 2) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP2]], 2 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 4) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP2]], 3 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP8]], 
[[TMP9]], i64 6) +// CHECK-NEXT: ret [[TMP10]] +// +// CPP-CHECK-LABEL: @_Z18test_svunpk_s64_x411svint32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , , , } @llvm.aarch64.sve.sunpk.x4.nxv2i64( [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 2) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP2]], 2 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 4) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP2]], 3 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP8]], [[TMP9]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP10]] +// +svint64x4_t test_svunpk_s64_x4(svint32x2_t zn) { + return SVE_ACLE_FUNC(svunpk,_s64_x4)(zn); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svunpk_u64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , , , } @llvm.aarch64.sve.uunpk.x4.nxv2i64( [[TMP0]], [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 2) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP2]], 2 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 4) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP2]], 3 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP8]], [[TMP9]], i64 6) +// CHECK-NEXT: ret [[TMP10]] +// +// CPP-CHECK-LABEL: @_Z18test_svunpk_u64_x412svuint32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , , , } @llvm.aarch64.sve.uunpk.x4.nxv2i64( [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 2) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP2]], 2 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 4) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP2]], 3 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP8]], [[TMP9]], i64 6) +// CPP-CHECK-NEXT: ret 
[[TMP10]] +// +svuint64x4_t test_svunpk_u64_x4(svuint32x2_t zn) { + return SVE_ACLE_FUNC(svunpk,_u64_x4)(zn); +} diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_uzpx2.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_uzpx2.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_uzpx2.c @@ -0,0 +1,579 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED,A5) A1##A3##A5 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4,A5) A1##A2##A3##A4##A5 +#endif + +// 8-bit UZPs + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svuzp_s8_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzp.x2.nxv16i8( [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z16test_svuzp_s8_x2u10__SVInt8_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzp.x2.nxv16i8( [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svint8x2_t test_svuzp_s8_x2(svint8_t zn, svint8_t zm) { + return SVE_ACLE_FUNC(svuzp,_s8,,,_x2)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svuzp_u8_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzp.x2.nxv16i8( [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) +// CHECK-NEXT: 
[[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z16test_svuzp_u8_x2u11__SVUint8_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzp.x2.nxv16i8( [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint8x2_t test_svuzp_u8_x2(svuint8_t zn, svuint8_t zm) { + return SVE_ACLE_FUNC(svuzp,_u8,,,_x2)(zn, zm); +} + +// 16-bit UZPs + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svuzp_s16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzp.x2.nxv8i16( [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z17test_svuzp_s16_x2u11__SVInt16_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzp.x2.nxv8i16( [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svint16x2_t test_svuzp_s16_x2(svint16_t zn, svint16_t zm) { + return SVE_ACLE_FUNC(svuzp,_s16,,,_x2)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svuzp_u16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzp.x2.nxv8i16( [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z17test_svuzp_u16_x2u12__SVUint16_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzp.x2.nxv8i16( [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint16x2_t test_svuzp_u16_x2(svuint16_t zn, svuint16_t zm) { + return SVE_ACLE_FUNC(svuzp,_u16,,,_x2)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svuzp_f16_x2( +// CHECK-NEXT: 
entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzp.x2.nxv8f16( [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z17test_svuzp_f16_x2u13__SVFloat16_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzp.x2.nxv8f16( [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svfloat16x2_t test_svuzp_f16_x2(svfloat16_t zn, svfloat16_t zm) { + return SVE_ACLE_FUNC(svuzp,_f16,,,_x2)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svuzp_bf16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzp.x2.nxv8bf16( [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z18test_svuzp_bf16_x2u14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzp.x2.nxv8bf16( [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svbfloat16x2_t test_svuzp_bf16_x2(svbfloat16_t zn, svbfloat16_t zm) { + return SVE_ACLE_FUNC(svuzp,_bf16,,,_x2)(zn, zm); +} + +// 32-bit UZPs + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svuzp_s32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzp.x2.nxv4i32( [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z17test_svuzp_s32_x2u11__SVInt32_tu11__SVInt32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzp.x2.nxv4i32( [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: 
[[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svint32x2_t test_svuzp_s32_x2(svint32_t zn, svint32_t zm) { + return SVE_ACLE_FUNC(svuzp,_s32,,,_x2)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svuzp_u32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzp.x2.nxv4i32( [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z17test_svuzp_u32_x2u12__SVUint32_tu12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzp.x2.nxv4i32( [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint32x2_t test_svuzp_u32_x2(svuint32_t zn, svuint32_t zm) { + return SVE_ACLE_FUNC(svuzp,_u32,,,_x2)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svuzp_f32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzp.x2.nxv4f32( [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z17test_svuzp_f32_x2u13__SVFloat32_tu13__SVFloat32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzp.x2.nxv4f32( [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svfloat32x2_t test_svuzp_f32_x2(svfloat32_t zn, svfloat32_t zm) { + return SVE_ACLE_FUNC(svuzp,_f32,,,_x2)(zn, zm); +} + +// 64-bit UZPs + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svuzp_s64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzp.x2.nxv2i64( [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z17test_svuzp_s64_x2u11__SVInt64_tu11__SVInt64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } 
@llvm.aarch64.sve.uzp.x2.nxv2i64( [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svint64x2_t test_svuzp_s64_x2(svint64_t zn, svint64_t zm) { + return SVE_ACLE_FUNC(svuzp,_s64,,,_x2)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svuzp_u64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzp.x2.nxv2i64( [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z17test_svuzp_u64_x2u12__SVUint64_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzp.x2.nxv2i64( [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint64x2_t test_svuzp_u64_x2(svuint64_t zn, svuint64_t zm) { + return SVE_ACLE_FUNC(svuzp,_u64,,,_x2)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svuzp_f64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzp.x2.nxv2f64( [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z17test_svuzp_f64_x2u13__SVFloat64_tu13__SVFloat64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzp.x2.nxv2f64( [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svfloat64x2_t test_svuzp_f64_x2(svfloat64_t zn, svfloat64_t zm) { + return SVE_ACLE_FUNC(svuzp,_f64,,,_x2)(zn, zm); +} + +// 128-bit UZPs + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svuzpq_s8_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzpq.x2.nxv16i8( [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } 
[[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z17test_svuzpq_s8_x2u10__SVInt8_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzpq.x2.nxv16i8( [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svint8x2_t test_svuzpq_s8_x2(svint8_t zn, svint8_t zm) { + return SVE_ACLE_FUNC(svuzpq,_s8,,,_x2)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svuzpq_u8_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzpq.x2.nxv16i8( [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z17test_svuzpq_u8_x2u11__SVUint8_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzpq.x2.nxv16i8( [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint8x2_t test_svuzpq_u8_x2(svuint8_t zn, svuint8_t zm) { + return SVE_ACLE_FUNC(svuzpq,_u8,,,_x2)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svuzpq_s16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzpq.x2.nxv8i16( [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z18test_svuzpq_s16_x2u11__SVInt16_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzpq.x2.nxv8i16( [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svint16x2_t test_svuzpq_s16_x2(svint16_t zn, svint16_t zm) { + return SVE_ACLE_FUNC(svuzpq,_s16,,,_x2)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svuzpq_u16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , 
} @llvm.aarch64.sve.uzpq.x2.nxv8i16( [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z18test_svuzpq_u16_x2u12__SVUint16_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzpq.x2.nxv8i16( [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint16x2_t test_svuzpq_u16_x2(svuint16_t zn, svuint16_t zm) { + return SVE_ACLE_FUNC(svuzpq,_u16,,,_x2)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svuzpq_f16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzpq.x2.nxv8f16( [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z18test_svuzpq_f16_x2u13__SVFloat16_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzpq.x2.nxv8f16( [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svfloat16x2_t test_svuzpq_f16_x2(svfloat16_t zn, svfloat16_t zm) { + return SVE_ACLE_FUNC(svuzpq,_f16,,,_x2)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svuzpq_bf16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzpq.x2.nxv8bf16( [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z19test_svuzpq_bf16_x2u14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzpq.x2.nxv8bf16( [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call 
@llvm.vector.insert.nxv16bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svbfloat16x2_t test_svuzpq_bf16_x2(svbfloat16_t zn, svbfloat16_t zm) { + return SVE_ACLE_FUNC(svuzpq,_bf16,,,_x2)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svuzpq_s32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzpq.x2.nxv4i32( [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z18test_svuzpq_s32_x2u11__SVInt32_tu11__SVInt32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzpq.x2.nxv4i32( [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svint32x2_t test_svuzpq_s32_x2(svint32_t zn, svint32_t zm) { + return SVE_ACLE_FUNC(svuzpq,_s32,,,_x2)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svuzpq_u32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzpq.x2.nxv4i32( [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z18test_svuzpq_u32_x2u12__SVUint32_tu12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzpq.x2.nxv4i32( [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint32x2_t test_svuzpq_u32_x2(svuint32_t zn, svuint32_t zm) { + return SVE_ACLE_FUNC(svuzpq,_u32,,,_x2)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svuzpq_f32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzpq.x2.nxv4f32( [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z18test_svuzpq_f32_x2u13__SVFloat32_tu13__SVFloat32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzpq.x2.nxv4f32( 
[[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svfloat32x2_t test_svuzpq_f32_x2(svfloat32_t zn, svfloat32_t zm) { + return SVE_ACLE_FUNC(svuzpq,_f32,,,_x2)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svuzpq_s64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzpq.x2.nxv2i64( [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z18test_svuzpq_s64_x2u11__SVInt64_tu11__SVInt64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzpq.x2.nxv2i64( [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svint64x2_t test_svuzpq_s64_x2(svint64_t zn, svint64_t zm) { + return SVE_ACLE_FUNC(svuzpq,_s64,,,_x2)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svuzpq_u64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzpq.x2.nxv2i64( [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z18test_svuzpq_u64_x2u12__SVUint64_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzpq.x2.nxv2i64( [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint64x2_t test_svuzpq_u64_x2(svuint64_t zn, svuint64_t zm) { + return SVE_ACLE_FUNC(svuzpq,_u64,,,_x2)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svuzpq_f64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzpq.x2.nxv2f64( [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = 
tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z18test_svuzpq_f64_x2u13__SVFloat64_tu13__SVFloat64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzpq.x2.nxv2f64( [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svfloat64x2_t test_svuzpq_f64_x2(svfloat64_t zn, svfloat64_t zm) { + return SVE_ACLE_FUNC(svuzpq,_f64,,,_x2)(zn, zm); +} diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_uzpx4.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_uzpx4.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_uzpx4.c @@ -0,0 +1,771 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. 
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED,A5) A1##A3##A5 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4,A5) A1##A2##A3##A4##A5 +#endif + +// 8-bit UZPs + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svuzp_s8_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzp.x4.nxv16i8( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z16test_svuzp_s8_x4u10__SVInt8_tu10__SVInt8_tu10__SVInt8_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzp.x4.nxv16i8( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint8x4_t test_svuzp_s8_x4(svint8_t zn1, svint8_t zn2, svint8_t zn3, svint8_t zn4) { + return SVE_ACLE_FUNC(svuzp,_s8,,,_x4)(zn1, zn2, zn3, zn4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svuzp_u8_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzp.x4.nxv16i8( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z16test_svuzp_u8_x4u11__SVUint8_tu11__SVUint8_tu11__SVUint8_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzp.x4.nxv16i8( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 
0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint8x4_t test_svuzp_u8_x4(svuint8_t zn1, svuint8_t zn2, svuint8_t zn3, svuint8_t zn4) { + return SVE_ACLE_FUNC(svuzp,_u8,,,_x4)(zn1, zn2, zn3, zn4); +} + +// 16-bit UZPs + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svuzp_s16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzp.x4.nxv8i16( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svuzp_s16_x4u11__SVInt16_tu11__SVInt16_tu11__SVInt16_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzp.x4.nxv8i16( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint16x4_t test_svuzp_s16_x4(svint16_t zn1, svint16_t zn2, svint16_t zn3, svint16_t zn4) { + return SVE_ACLE_FUNC(svuzp,_s16,,,_x4)(zn1, zn2, zn3, zn4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svuzp_u16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzp.x4.nxv8i16( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], 
[[TMP3]], i64 8) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svuzp_u16_x4u12__SVUint16_tu12__SVUint16_tu12__SVUint16_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzp.x4.nxv8i16( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint16x4_t test_svuzp_u16_x4(svuint16_t zn1, svuint16_t zn2, svuint16_t zn3, svuint16_t zn4) { + return SVE_ACLE_FUNC(svuzp,_u16,,,_x4)(zn1, zn2, zn3, zn4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svuzp_f16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzp.x4.nxv8f16( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP4]], [[TMP5]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 24) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svuzp_f16_x4u13__SVFloat16_tu13__SVFloat16_tu13__SVFloat16_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzp.x4.nxv8f16( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP4]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 24) +// 
CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat16x4_t test_svuzp_f16_x4(svfloat16_t zn1, svfloat16_t zn2, svfloat16_t zn3, svfloat16_t zn4) { + return SVE_ACLE_FUNC(svuzp,_f16,,,_x4)(zn1, zn2, zn3, zn4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svuzp_bf16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzp.x4.nxv8bf16( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP4]], [[TMP5]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP6]], [[TMP7]], i64 24) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z18test_svuzp_bf16_x4u14__SVBFloat16_tu14__SVBFloat16_tu14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzp.x4.nxv8bf16( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP4]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP6]], [[TMP7]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svbfloat16x4_t test_svuzp_bf16_x4(svbfloat16_t zn1, svbfloat16_t zn2, svbfloat16_t zn3, svbfloat16_t zn4) { + return SVE_ACLE_FUNC(svuzp,_bf16,,,_x4)(zn1, zn2, zn3, zn4); +} + +// 32-bit UZPs + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svuzp_s32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzp.x4.nxv4i32( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svuzp_s32_x4u11__SVInt32_tu11__SVInt32_tu11__SVInt32_tu11__SVInt32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } 
@llvm.aarch64.sve.uzp.x4.nxv4i32( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint32x4_t test_svuzp_s32_x4(svint32_t zn1, svint32_t zn2, svint32_t zn3, svint32_t zn4) { + return SVE_ACLE_FUNC(svuzp,_s32,,,_x4)(zn1, zn2, zn3, zn4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svuzp_u32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzp.x4.nxv4i32( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svuzp_u32_x4u12__SVUint32_tu12__SVUint32_tu12__SVUint32_tu12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzp.x4.nxv4i32( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint32x4_t test_svuzp_u32_x4(svuint32_t zn1, svuint32_t zn2, svuint32_t zn3, svuint32_t zn4) { + return SVE_ACLE_FUNC(svuzp,_u32,,,_x4)(zn1, zn2, zn3, zn4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svuzp_f32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzp.x4.nxv4f32( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) +// 
CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svuzp_f32_x4u13__SVFloat32_tu13__SVFloat32_tu13__SVFloat32_tu13__SVFloat32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzp.x4.nxv4f32( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat32x4_t test_svuzp_f32_x4(svfloat32_t zn1, svfloat32_t zn2, svfloat32_t zn3, svfloat32_t zn4) { + return SVE_ACLE_FUNC(svuzp,_f32,,,_x4)(zn1, zn2, zn3, zn4); +} + +// 64-bit UZPs + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svuzp_s64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzp.x4.nxv2i64( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svuzp_s64_x4u11__SVInt64_tu11__SVInt64_tu11__SVInt64_tu11__SVInt64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzp.x4.nxv2i64( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = 
extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint64x4_t test_svuzp_s64_x4(svint64_t zn1, svint64_t zn2, svint64_t zn3, svint64_t zn4) { + return SVE_ACLE_FUNC(svuzp,_s64,,,_x4)(zn1, zn2, zn3, zn4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svuzp_u64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzp.x4.nxv2i64( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svuzp_u64_x4u12__SVUint64_tu12__SVUint64_tu12__SVUint64_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzp.x4.nxv2i64( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint64x4_t test_svuzp_u64_x4(svuint64_t zn1, svuint64_t zn2, svuint64_t zn3, svuint64_t zn4) { + return SVE_ACLE_FUNC(svuzp,_u64,,,_x4)(zn1, zn2, zn3, zn4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svuzp_f64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzp.x4.nxv2f64( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 6) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svuzp_f64_x4u13__SVFloat64_tu13__SVFloat64_tu13__SVFloat64_tu13__SVFloat64_t( +// CPP-CHECK-NEXT: entry: 
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzp.x4.nxv2f64( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat64x4_t test_svuzp_f64_x4(svfloat64_t zn1, svfloat64_t zn2, svfloat64_t zn3, svfloat64_t zn4) { + return SVE_ACLE_FUNC(svuzp,_f64,,,_x4)(zn1, zn2, zn3, zn4); +} + +// 128-bit UZPs + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svuzpq_s8_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzpq.x4.nxv16i8( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svuzpq_s8_x4u10__SVInt8_tu10__SVInt8_tu10__SVInt8_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzpq.x4.nxv16i8( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint8x4_t test_svuzpq_s8_x4(svint8_t zn1, svint8_t zn2, svint8_t zn3, svint8_t zn4) { + return SVE_ACLE_FUNC(svuzpq,_s8,,,_x4)(zn1, zn2, zn3, zn4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svuzpq_u8_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzpq.x4.nxv16i8( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call 
@llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svuzpq_u8_x4u11__SVUint8_tu11__SVUint8_tu11__SVUint8_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzpq.x4.nxv16i8( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint8x4_t test_svuzpq_u8_x4(svuint8_t zn1, svuint8_t zn2, svuint8_t zn3, svuint8_t zn4) { + return SVE_ACLE_FUNC(svuzpq,_u8,,,_x4)(zn1, zn2, zn3, zn4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svuzpq_s16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzpq.x4.nxv8i16( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z18test_svuzpq_s16_x4u11__SVInt16_tu11__SVInt16_tu11__SVInt16_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzpq.x4.nxv8i16( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 
16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint16x4_t test_svuzpq_s16_x4(svint16_t zn1, svint16_t zn2, svint16_t zn3, svint16_t zn4) { + return SVE_ACLE_FUNC(svuzpq,_s16,,,_x4)(zn1, zn2, zn3, zn4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svuzpq_u16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzpq.x4.nxv8i16( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z18test_svuzpq_u16_x4u12__SVUint16_tu12__SVUint16_tu12__SVUint16_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzpq.x4.nxv8i16( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint16x4_t test_svuzpq_u16_x4(svuint16_t zn1, svuint16_t zn2, svuint16_t zn3, svuint16_t zn4) { + return SVE_ACLE_FUNC(svuzpq,_u16,,,_x4)(zn1, zn2, zn3, zn4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svuzpq_f16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzpq.x4.nxv8f16( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP4]], [[TMP5]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 24) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: 
@_Z18test_svuzpq_f16_x4u13__SVFloat16_tu13__SVFloat16_tu13__SVFloat16_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzpq.x4.nxv8f16( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP4]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat16x4_t test_svuzpq_f16_x4(svfloat16_t zn1, svfloat16_t zn2, svfloat16_t zn3, svfloat16_t zn4) { + return SVE_ACLE_FUNC(svuzpq,_f16,,,_x4)(zn1, zn2, zn3, zn4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svuzpq_bf16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzpq.x4.nxv8bf16( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP4]], [[TMP5]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP6]], [[TMP7]], i64 24) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z19test_svuzpq_bf16_x4u14__SVBFloat16_tu14__SVBFloat16_tu14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzpq.x4.nxv8bf16( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP4]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP6]], [[TMP7]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svbfloat16x4_t test_svuzpq_bf16_x4(svbfloat16_t zn1, svbfloat16_t zn2, svbfloat16_t zn3, svbfloat16_t zn4) { + return SVE_ACLE_FUNC(svuzpq,_bf16,,,_x4)(zn1, zn2, zn3, zn4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svuzpq_s32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } 
@llvm.aarch64.sve.uzpq.x4.nxv4i32( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z18test_svuzpq_s32_x4u11__SVInt32_tu11__SVInt32_tu11__SVInt32_tu11__SVInt32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzpq.x4.nxv4i32( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint32x4_t test_svuzpq_s32_x4(svint32_t zn1, svint32_t zn2, svint32_t zn3, svint32_t zn4) { + return SVE_ACLE_FUNC(svuzpq,_s32,,,_x4)(zn1, zn2, zn3, zn4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svuzpq_u32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzpq.x4.nxv4i32( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z18test_svuzpq_u32_x4u12__SVUint32_tu12__SVUint32_tu12__SVUint32_tu12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzpq.x4.nxv4i32( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( 
[[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint32x4_t test_svuzpq_u32_x4(svuint32_t zn1, svuint32_t zn2, svuint32_t zn3, svuint32_t zn4) { + return SVE_ACLE_FUNC(svuzpq,_u32,,,_x4)(zn1, zn2, zn3, zn4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svuzpq_f32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzpq.x4.nxv4f32( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z18test_svuzpq_f32_x4u13__SVFloat32_tu13__SVFloat32_tu13__SVFloat32_tu13__SVFloat32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzpq.x4.nxv4f32( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat32x4_t test_svuzpq_f32_x4(svfloat32_t zn1, svfloat32_t zn2, svfloat32_t zn3, svfloat32_t zn4) { + return SVE_ACLE_FUNC(svuzpq,_f32,,,_x4)(zn1, zn2, zn3, zn4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svuzpq_s64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzpq.x4.nxv2i64( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } 
[[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z18test_svuzpq_s64_x4u11__SVInt64_tu11__SVInt64_tu11__SVInt64_tu11__SVInt64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzpq.x4.nxv2i64( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint64x4_t test_svuzpq_s64_x4(svint64_t zn1, svint64_t zn2, svint64_t zn3, svint64_t zn4) { + return SVE_ACLE_FUNC(svuzpq,_s64,,,_x4)(zn1, zn2, zn3, zn4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svuzpq_u64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzpq.x4.nxv2i64( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z18test_svuzpq_u64_x4u12__SVUint64_tu12__SVUint64_tu12__SVUint64_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzpq.x4.nxv2i64( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint64x4_t test_svuzpq_u64_x4(svuint64_t zn1, svuint64_t zn2, svuint64_t zn3, svuint64_t zn4) { + return SVE_ACLE_FUNC(svuzpq,_u64,,,_x4)(zn1, zn2, zn3, zn4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svuzpq_f64_x4( +// 
CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzpq.x4.nxv2f64( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 6) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z18test_svuzpq_f64_x4u13__SVFloat64_tu13__SVFloat64_tu13__SVFloat64_tu13__SVFloat64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzpq.x4.nxv2f64( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat64x4_t test_svuzpq_f64_x4(svfloat64_t zn1, svfloat64_t zn2, svfloat64_t zn3, svfloat64_t zn4) { + return SVE_ACLE_FUNC(svuzpq,_f64,,,_x4)(zn1, zn2, zn3, zn4); +} diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_while_pn.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_while_pn.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_while_pn.c @@ -0,0 +1,1056 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + +// REQUIRES: aarch64-registered-target + +#include <arm_sve.h> + + +// WHILEGE + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilege_c8_vl2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilege.c8(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z21test_svwhilege_c8_vl2ll( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilege.c8(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilege_c8_vl2(int64_t op1, int64_t op2) +{ +
return svwhilege_c8(op1, op2, 2); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilege_c8_vl4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilege.c8(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z21test_svwhilege_c8_vl4ll( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilege.c8(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilege_c8_vl4(int64_t op1, int64_t op2) +{ + return svwhilege_c8(op1, op2, 4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilege_c16_vl2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilege.c16(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svwhilege_c16_vl2ll( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilege.c16(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilege_c16_vl2(int64_t op1, int64_t op2) +{ + return svwhilege_c16(op1, op2, 2); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilege_c16_vl4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilege.c16(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svwhilege_c16_vl4ll( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilege.c16(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilege_c16_vl4(int64_t op1, int64_t op2) +{ + return svwhilege_c16(op1, op2, 4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilege_c32_vl2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilege.c32(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svwhilege_c32_vl2ll( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilege.c32(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilege_c32_vl2(int64_t op1, int64_t op2) +{ + return svwhilege_c32(op1, op2, 2); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilege_c32_vl4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilege.c32(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svwhilege_c32_vl4ll( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilege.c32(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilege_c32_vl4(int64_t op1, int64_t op2) +{ + return svwhilege_c32(op1, op2, 4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilege_c64_vl2( +// CHECK-NEXT: entry: +// 
CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilege.c64(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svwhilege_c64_vl2ll( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilege.c64(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilege_c64_vl2(int64_t op1, int64_t op2) +{ + return svwhilege_c64(op1, op2, 2); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilege_c64_vl4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilege.c64(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svwhilege_c64_vl4ll( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilege.c64(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilege_c64_vl4(int64_t op1, int64_t op2) +{ + return svwhilege_c64(op1, op2, 4); +} + + +// WHILEGT + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilegt_c8_vl2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilegt.c8(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z21test_svwhilegt_c8_vl2ll( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilegt.c8(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilegt_c8_vl2(int64_t op1, int64_t op2) +{ + return svwhilegt_c8(op1, op2, 2); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilegt_c8_vl4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilegt.c8(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z21test_svwhilegt_c8_vl4ll( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilegt.c8(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilegt_c8_vl4(int64_t op1, int64_t op2) +{ + return svwhilegt_c8(op1, op2, 4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilegt_c16_vl2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilegt.c16(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svwhilegt_c16_vl2ll( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilegt.c16(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilegt_c16_vl2(int64_t op1, int64_t op2) +{ + return svwhilegt_c16(op1, op2, 2); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilegt_c16_vl4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilegt.c16(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) 
+// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svwhilegt_c16_vl4ll( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilegt.c16(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilegt_c16_vl4(int64_t op1, int64_t op2) +{ + return svwhilegt_c16(op1, op2, 4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilegt_c32_vl2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilegt.c32(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svwhilegt_c32_vl2ll( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilegt.c32(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilegt_c32_vl2(int64_t op1, int64_t op2) +{ + return svwhilegt_c32(op1, op2, 2); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilegt_c32_vl4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilegt.c32(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svwhilegt_c32_vl4ll( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilegt.c32(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilegt_c32_vl4(int64_t op1, int64_t op2) +{ + return svwhilegt_c32(op1, op2, 4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilegt_c64_vl2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilegt.c64(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svwhilegt_c64_vl2ll( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilegt.c64(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilegt_c64_vl2(int64_t op1, int64_t op2) +{ + return svwhilegt_c64(op1, op2, 2); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilegt_c64_vl4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilegt.c64(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svwhilegt_c64_vl4ll( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilegt.c64(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilegt_c64_vl4(int64_t op1, int64_t op2) +{ + return svwhilegt_c64(op1, op2, 4); +} + + +// WHILEHI + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilehi_c8_vl2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilehi.c8(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z21test_svwhilehi_c8_vl2mm( +// 
CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilehi.c8(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilehi_c8_vl2(uint64_t op1, uint64_t op2) +{ + return svwhilehi_c8(op1, op2, 2); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilehi_c8_vl4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilehi.c8(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z21test_svwhilehi_c8_vl4mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilehi.c8(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilehi_c8_vl4(uint64_t op1, uint64_t op2) +{ + return svwhilehi_c8(op1, op2, 4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilehi_c16_vl2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilehi.c16(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svwhilehi_c16_vl2mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilehi.c16(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilehi_c16_vl2(uint64_t op1, uint64_t op2) +{ + return svwhilehi_c16(op1, op2, 2); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilehi_c16_vl4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilehi.c16(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svwhilehi_c16_vl4mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilehi.c16(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilehi_c16_vl4(uint64_t op1, uint64_t op2) +{ + return svwhilehi_c16(op1, op2, 4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilehi_c32_vl2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilehi.c32(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svwhilehi_c32_vl2mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilehi.c32(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilehi_c32_vl2(uint64_t op1, uint64_t op2) +{ + return svwhilehi_c32(op1, op2, 2); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilehi_c32_vl4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilehi.c32(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svwhilehi_c32_vl4mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilehi.c32(i64 
[[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilehi_c32_vl4(uint64_t op1, uint64_t op2) +{ + return svwhilehi_c32(op1, op2, 4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilehi_c64_vl2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilehi.c64(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svwhilehi_c64_vl2mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilehi.c64(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilehi_c64_vl2(uint64_t op1, uint64_t op2) +{ + return svwhilehi_c64(op1, op2, 2); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilehi_c64_vl4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilehi.c64(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svwhilehi_c64_vl4mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilehi.c64(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilehi_c64_vl4(uint64_t op1, uint64_t op2) +{ + return svwhilehi_c64(op1, op2, 4); +} + + +// WHILEHS + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilehs_c8_vl2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilehs.c8(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z21test_svwhilehs_c8_vl2mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilehs.c8(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilehs_c8_vl2(uint64_t op1, uint64_t op2) +{ + return svwhilehs_c8(op1, op2, 2); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilehs_c8_vl4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilehs.c8(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z21test_svwhilehs_c8_vl4mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilehs.c8(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilehs_c8_vl4(uint64_t op1, uint64_t op2) +{ + return svwhilehs_c8(op1, op2, 4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilehs_c16_vl2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilehs.c16(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svwhilehs_c16_vl2mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilehs.c16(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t 
test_svwhilehs_c16_vl2(uint64_t op1, uint64_t op2) +{ + return svwhilehs_c16(op1, op2, 2); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilehs_c16_vl4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilehs.c16(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svwhilehs_c16_vl4mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilehs.c16(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilehs_c16_vl4(uint64_t op1, uint64_t op2) +{ + return svwhilehs_c16(op1, op2, 4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilehs_c32_vl2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilehs.c32(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svwhilehs_c32_vl2mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilehs.c32(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilehs_c32_vl2(uint64_t op1, uint64_t op2) +{ + return svwhilehs_c32(op1, op2, 2); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilehs_c32_vl4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilehs.c32(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svwhilehs_c32_vl4mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilehs.c32(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilehs_c32_vl4(uint64_t op1, uint64_t op2) +{ + return svwhilehs_c32(op1, op2, 4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilehs_c64_vl2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilehs.c64(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svwhilehs_c64_vl2mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilehs.c64(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilehs_c64_vl2(uint64_t op1, uint64_t op2) +{ + return svwhilehs_c64(op1, op2, 2); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilehs_c64_vl4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilehs.c64(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svwhilehs_c64_vl4mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilehs.c64(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilehs_c64_vl4(uint64_t op1, uint64_t op2) +{ + return svwhilehs_c64(op1, op2, 4); +} + + +// WHILELE + 
+__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilele_c8_vl2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilele.c8(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z21test_svwhilele_c8_vl2ll( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilele.c8(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilele_c8_vl2(int64_t op1, int64_t op2) +{ + return svwhilele_c8(op1, op2, 2); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilele_c8_vl4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilele.c8(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z21test_svwhilele_c8_vl4ll( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilele.c8(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilele_c8_vl4(int64_t op1, int64_t op2) +{ + return svwhilele_c8(op1, op2, 4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilele_c16_vl2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilele.c16(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svwhilele_c16_vl2ll( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilele.c16(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilele_c16_vl2(int64_t op1, int64_t op2) +{ + return svwhilele_c16(op1, op2, 2); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilele_c16_vl4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilele.c16(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svwhilele_c16_vl4ll( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilele.c16(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilele_c16_vl4(int64_t op1, int64_t op2) +{ + return svwhilele_c16(op1, op2, 4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilele_c32_vl2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilele.c32(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svwhilele_c32_vl2ll( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilele.c32(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilele_c32_vl2(int64_t op1, int64_t op2) +{ + return svwhilele_c32(op1, op2, 2); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilele_c32_vl4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call 
target("aarch64.svcount") @llvm.aarch64.sve.whilele.c32(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svwhilele_c32_vl4ll( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilele.c32(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilele_c32_vl4(int64_t op1, int64_t op2) +{ + return svwhilele_c32(op1, op2, 4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilele_c64_vl2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilele.c64(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svwhilele_c64_vl2ll( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilele.c64(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilele_c64_vl2(int64_t op1, int64_t op2) +{ + return svwhilele_c64(op1, op2, 2); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilele_c64_vl4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilele.c64(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svwhilele_c64_vl4ll( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilele.c64(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilele_c64_vl4(int64_t op1, int64_t op2) +{ + return svwhilele_c64(op1, op2, 4); +} + + +// WHILELO + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilelo_c8_vl2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilelo.c8(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z21test_svwhilelo_c8_vl2mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilelo.c8(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilelo_c8_vl2(uint64_t op1, uint64_t op2) +{ + return svwhilelo_c8(op1, op2, 2); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilelo_c8_vl4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilelo.c8(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z21test_svwhilelo_c8_vl4mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilelo.c8(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilelo_c8_vl4(uint64_t op1, uint64_t op2) +{ + return svwhilelo_c8(op1, op2, 4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilelo_c16_vl2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilelo.c16(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CHECK-NEXT: ret 
target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svwhilelo_c16_vl2mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilelo.c16(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilelo_c16_vl2(uint64_t op1, uint64_t op2) +{ + return svwhilelo_c16(op1, op2, 2); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilelo_c16_vl4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilelo.c16(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svwhilelo_c16_vl4mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilelo.c16(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilelo_c16_vl4(uint64_t op1, uint64_t op2) +{ + return svwhilelo_c16(op1, op2, 4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilelo_c32_vl2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilelo.c32(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svwhilelo_c32_vl2mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilelo.c32(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilelo_c32_vl2(uint64_t op1, uint64_t op2) +{ + return svwhilelo_c32(op1, op2, 2); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilelo_c32_vl4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilelo.c32(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svwhilelo_c32_vl4mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilelo.c32(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilelo_c32_vl4(uint64_t op1, uint64_t op2) +{ + return svwhilelo_c32(op1, op2, 4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilelo_c64_vl2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilelo.c64(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svwhilelo_c64_vl2mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilelo.c64(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilelo_c64_vl2(uint64_t op1, uint64_t op2) +{ + return svwhilelo_c64(op1, op2, 2); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilelo_c64_vl4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilelo.c64(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svwhilelo_c64_vl4mm( +// CPP-CHECK-NEXT: entry: +// 
CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilelo.c64(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilelo_c64_vl4(uint64_t op1, uint64_t op2) +{ + return svwhilelo_c64(op1, op2, 4); +} + + +// WHILELS + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilels_c8_vl2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilels.c8(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z21test_svwhilels_c8_vl2mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilels.c8(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilels_c8_vl2(uint64_t op1, uint64_t op2) +{ + return svwhilels_c8(op1, op2, 2); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilels_c8_vl4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilels.c8(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z21test_svwhilels_c8_vl4mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilels.c8(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilels_c8_vl4(uint64_t op1, uint64_t op2) +{ + return svwhilels_c8(op1, op2, 4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilels_c16_vl2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilels.c16(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svwhilels_c16_vl2mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilels.c16(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilels_c16_vl2(uint64_t op1, uint64_t op2) +{ + return svwhilels_c16(op1, op2, 2); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilels_c16_vl4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilels.c16(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svwhilels_c16_vl4mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilels.c16(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilels_c16_vl4(uint64_t op1, uint64_t op2) +{ + return svwhilels_c16(op1, op2, 4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilels_c32_vl2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilels.c32(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svwhilels_c32_vl2mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilels.c32(i64 [[OP1:%.*]], i64 
[[OP2:%.*]], i32 2) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilels_c32_vl2(uint64_t op1, uint64_t op2) +{ + return svwhilels_c32(op1, op2, 2); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilels_c32_vl4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilels.c32(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svwhilels_c32_vl4mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilels.c32(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilels_c32_vl4(uint64_t op1, uint64_t op2) +{ + return svwhilels_c32(op1, op2, 4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilels_c64_vl2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilels.c64(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svwhilels_c64_vl2mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilels.c64(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilels_c64_vl2(uint64_t op1, uint64_t op2) +{ + return svwhilels_c64(op1, op2, 2); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilels_c64_vl4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilels.c64(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svwhilels_c64_vl4mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilels.c64(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilels_c64_vl4(uint64_t op1, uint64_t op2) +{ + return svwhilels_c64(op1, op2, 4); +} + + +// WHILELT + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilelt_c8_vl2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilelt.c8(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z21test_svwhilelt_c8_vl2ll( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilelt.c8(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilelt_c8_vl2(int64_t op1, int64_t op2) +{ + return svwhilelt_c8(op1, op2, 2); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilelt_c8_vl4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilelt.c8(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z21test_svwhilelt_c8_vl4ll( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilelt.c8(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilelt_c8_vl4(int64_t 
op1, int64_t op2) +{ + return svwhilelt_c8(op1, op2, 4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilelt_c16_vl2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilelt.c16(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svwhilelt_c16_vl2ll( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilelt.c16(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilelt_c16_vl2(int64_t op1, int64_t op2) +{ + return svwhilelt_c16(op1, op2, 2); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilelt_c16_vl4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilelt.c16(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svwhilelt_c16_vl4ll( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilelt.c16(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilelt_c16_vl4(int64_t op1, int64_t op2) +{ + return svwhilelt_c16(op1, op2, 4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilelt_c32_vl2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilelt.c32(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svwhilelt_c32_vl2ll( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilelt.c32(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilelt_c32_vl2(int64_t op1, int64_t op2) +{ + return svwhilelt_c32(op1, op2, 2); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilelt_c32_vl4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilelt.c32(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svwhilelt_c32_vl4ll( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilelt.c32(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilelt_c32_vl4(int64_t op1, int64_t op2) +{ + return svwhilelt_c32(op1, op2, 4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilelt_c64_vl2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilelt.c64(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svwhilelt_c64_vl2ll( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilelt.c64(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilelt_c64_vl2(int64_t op1, int64_t op2) +{ + return svwhilelt_c64(op1, op2, 2); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilelt_c64_vl4( +// 
CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilelt.c64(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4)
+// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z22test_svwhilelt_c64_vl4ll(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilelt.c64(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4)
+// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]]
+//
+svcount_t test_svwhilelt_c64_vl4(int64_t op1, int64_t op2)
+{
+ return svwhilelt_c64(op1, op2, 4);
+}
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_while_pp.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_while_pp.c
new file mode 100644
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_while_pp.c
@@ -0,0 +1,894 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// REQUIRES: aarch64-registered-target
+#include <arm_sve.h>
+
+
+// WHILEGE
+
+__attribute__((arm_streaming_compatible))
+// CHECK-LABEL: @test_svwhilege_b8_x2(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilege.x2.nxv16i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0
+// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP1]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1
+// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP2]], [[TMP3]], i64 16)
+// CHECK-NEXT: ret [[TMP4]]
+//
+// CPP-CHECK-LABEL: @_Z20test_svwhilege_b8_x2ll(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilege.x2.nxv16i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP1]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP2]], [[TMP3]], i64 16)
+// CPP-CHECK-NEXT: ret [[TMP4]]
+//
+svboolx2_t test_svwhilege_b8_x2(int64_t op1, int64_t op2)
+{
+ return svwhilege_b8_x2(op1, op2);
+}
+
+__attribute__((arm_streaming_compatible))
+// CHECK-LABEL: @test_svwhilege_b16_x2(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilege.x2.nxv8i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0
+// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0)
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1
+// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP4]])
+// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16)
+// CHECK-NEXT: ret [[TMP6]]
+//
+// CPP-CHECK-LABEL: @_Z21test_svwhilege_b16_x2ll(
+//
CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilege.x2.nxv8i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP4]]) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svboolx2_t test_svwhilege_b16_x2(int64_t op1, int64_t op2) +{ + return svwhilege_b16_x2(op1, op2); +} + +__attribute__((arm_streaming_compatible)) +// CHECK-LABEL: @test_svwhilege_b32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilege.x2.nxv4i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP4]]) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z21test_svwhilege_b32_x2ll( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilege.x2.nxv4i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP4]]) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svboolx2_t test_svwhilege_b32_x2(int64_t op1, int64_t op2) +{ + return svwhilege_b32_x2(op1, op2); +} + +__attribute__((arm_streaming_compatible)) +// CHECK-LABEL: @test_svwhilege_b64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilege.x2.nxv2i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP4]]) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z21test_svwhilege_b64_x2ll( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilege.x2.nxv2i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// 
CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP4]]) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svboolx2_t test_svwhilege_b64_x2(int64_t op1, int64_t op2) +{ + return svwhilege_b64_x2(op1, op2); +} + + +// WHILEGT + +__attribute__((arm_streaming_compatible)) +// CHECK-LABEL: @test_svwhilegt_b8_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilegt.x2.nxv16i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP2]], [[TMP3]], i64 16) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z20test_svwhilegt_b8_x2ll( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilegt.x2.nxv16i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP2]], [[TMP3]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svboolx2_t test_svwhilegt_b8_x2(int64_t op1, int64_t op2) +{ + return svwhilegt_b8_x2(op1, op2); +} + +__attribute__((arm_streaming_compatible)) +// CHECK-LABEL: @test_svwhilegt_b16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilegt.x2.nxv8i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP4]]) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z21test_svwhilegt_b16_x2ll( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilegt.x2.nxv8i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP4]]) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svboolx2_t test_svwhilegt_b16_x2(int64_t op1, int64_t op2) +{ + return 
svwhilegt_b16_x2(op1, op2); +} + +__attribute__((arm_streaming_compatible)) +// CHECK-LABEL: @test_svwhilegt_b32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilegt.x2.nxv4i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP4]]) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z21test_svwhilegt_b32_x2ll( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilegt.x2.nxv4i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP4]]) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svboolx2_t test_svwhilegt_b32_x2(int64_t op1, int64_t op2) +{ + return svwhilegt_b32_x2(op1, op2); +} + +__attribute__((arm_streaming_compatible)) +// CHECK-LABEL: @test_svwhilegt_b64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilegt.x2.nxv2i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP4]]) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z21test_svwhilegt_b64_x2ll( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilegt.x2.nxv2i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP4]]) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svboolx2_t test_svwhilegt_b64_x2(int64_t op1, int64_t op2) +{ + return svwhilegt_b64_x2(op1, op2); +} + + +// WHILEHI + +__attribute__((arm_streaming_compatible)) +// CHECK-LABEL: @test_svwhilehi_b8_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } 
@llvm.aarch64.sve.whilehi.x2.nxv16i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP2]], [[TMP3]], i64 16) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z20test_svwhilehi_b8_x2mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilehi.x2.nxv16i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP2]], [[TMP3]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svboolx2_t test_svwhilehi_b8_x2(uint64_t op1, uint64_t op2) +{ + return svwhilehi_b8_x2(op1, op2); +} + +__attribute__((arm_streaming_compatible)) +// CHECK-LABEL: @test_svwhilehi_b16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilehi.x2.nxv8i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP4]]) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z21test_svwhilehi_b16_x2mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilehi.x2.nxv8i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP4]]) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svboolx2_t test_svwhilehi_b16_x2(uint64_t op1, uint64_t op2) +{ + return svwhilehi_b16_x2(op1, op2); +} + +__attribute__((arm_streaming_compatible)) +// CHECK-LABEL: @test_svwhilehi_b32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilehi.x2.nxv4i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP4]]) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// 
CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z21test_svwhilehi_b32_x2mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilehi.x2.nxv4i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP4]]) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svboolx2_t test_svwhilehi_b32_x2(uint64_t op1, uint64_t op2) +{ + return svwhilehi_b32_x2(op1, op2); +} + +__attribute__((arm_streaming_compatible)) +// CHECK-LABEL: @test_svwhilehi_b64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilehi.x2.nxv2i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP4]]) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z21test_svwhilehi_b64_x2mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilehi.x2.nxv2i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP4]]) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svboolx2_t test_svwhilehi_b64_x2(uint64_t op1, uint64_t op2) +{ + return svwhilehi_b64_x2(op1, op2); +} + + +// WHILEHS + +__attribute__((arm_streaming_compatible)) +// CHECK-LABEL: @test_svwhilehs_b8_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilehs.x2.nxv16i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP2]], [[TMP3]], i64 16) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z20test_svwhilehs_b8_x2mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilehs.x2.nxv16i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, 
[[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP2]], [[TMP3]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svboolx2_t test_svwhilehs_b8_x2(uint64_t op1, uint64_t op2) +{ + return svwhilehs_b8_x2(op1, op2); +} + +__attribute__((arm_streaming_compatible)) +// CHECK-LABEL: @test_svwhilehs_b16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilehs.x2.nxv8i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP4]]) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z21test_svwhilehs_b16_x2mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilehs.x2.nxv8i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP4]]) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svboolx2_t test_svwhilehs_b16_x2(uint64_t op1, uint64_t op2) +{ + return svwhilehs_b16_x2(op1, op2); +} + +__attribute__((arm_streaming_compatible)) +// CHECK-LABEL: @test_svwhilehs_b32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilehs.x2.nxv4i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP4]]) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z21test_svwhilehs_b32_x2mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilehs.x2.nxv4i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP4]]) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: ret 
[[TMP6]] +// +svboolx2_t test_svwhilehs_b32_x2(uint64_t op1, uint64_t op2) +{ + return svwhilehs_b32_x2(op1, op2); +} + +__attribute__((arm_streaming_compatible)) +// CHECK-LABEL: @test_svwhilehs_b64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilehs.x2.nxv2i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP4]]) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z21test_svwhilehs_b64_x2mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilehs.x2.nxv2i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP4]]) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svboolx2_t test_svwhilehs_b64_x2(uint64_t op1, uint64_t op2) +{ + return svwhilehs_b64_x2(op1, op2); +} + + +// WHILELE + +__attribute__((arm_streaming_compatible)) +// CHECK-LABEL: @test_svwhilele_b8_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilele.x2.nxv16i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP2]], [[TMP3]], i64 16) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z20test_svwhilele_b8_x2ll( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilele.x2.nxv16i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP2]], [[TMP3]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svboolx2_t test_svwhilele_b8_x2(int64_t op1, int64_t op2) +{ + return svwhilele_b8_x2(op1, op2); +} + +__attribute__((arm_streaming_compatible)) +// CHECK-LABEL: @test_svwhilele_b16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilele.x2.nxv8i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, 
[[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP4]]) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z21test_svwhilele_b16_x2ll( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilele.x2.nxv8i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP4]]) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svboolx2_t test_svwhilele_b16_x2(int64_t op1, int64_t op2) +{ + return svwhilele_b16_x2(op1, op2); +} + +__attribute__((arm_streaming_compatible)) +// CHECK-LABEL: @test_svwhilele_b32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilele.x2.nxv4i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP4]]) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z21test_svwhilele_b32_x2ll( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilele.x2.nxv4i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP4]]) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svboolx2_t test_svwhilele_b32_x2(int64_t op1, int64_t op2) +{ + return svwhilele_b32_x2(op1, op2); +} + +__attribute__((arm_streaming_compatible)) +// CHECK-LABEL: @test_svwhilele_b64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilele.x2.nxv2i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP4]]) +// CHECK-NEXT: [[TMP6:%.*]] = tail call 
@llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z21test_svwhilele_b64_x2ll( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilele.x2.nxv2i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP4]]) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svboolx2_t test_svwhilele_b64_x2(int64_t op1, int64_t op2) +{ + return svwhilele_b64_x2(op1, op2); +} + + +// WHILELO + +__attribute__((arm_streaming_compatible)) +// CHECK-LABEL: @test_svwhilelo_b8_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilelo.x2.nxv16i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP2]], [[TMP3]], i64 16) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z20test_svwhilelo_b8_x2mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilelo.x2.nxv16i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP2]], [[TMP3]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svboolx2_t test_svwhilelo_b8_x2(uint64_t op1, uint64_t op2) +{ + return svwhilelo_b8_x2(op1, op2); +} + +__attribute__((arm_streaming_compatible)) +// CHECK-LABEL: @test_svwhilelo_b16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilelo.x2.nxv8i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP4]]) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z21test_svwhilelo_b16_x2mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilelo.x2.nxv8i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: 
[[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP4]]) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svboolx2_t test_svwhilelo_b16_x2(uint64_t op1, uint64_t op2) +{ + return svwhilelo_b16_x2(op1, op2); +} + +__attribute__((arm_streaming_compatible)) +// CHECK-LABEL: @test_svwhilelo_b32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilelo.x2.nxv4i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP4]]) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z21test_svwhilelo_b32_x2mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilelo.x2.nxv4i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP4]]) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svboolx2_t test_svwhilelo_b32_x2(uint64_t op1, uint64_t op2) +{ + return svwhilelo_b32_x2(op1, op2); +} + +__attribute__((arm_streaming_compatible)) +// CHECK-LABEL: @test_svwhilelo_b64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilelo.x2.nxv2i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP4]]) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z21test_svwhilelo_b64_x2mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilelo.x2.nxv2i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP4]]) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call 
@llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svboolx2_t test_svwhilelo_b64_x2(uint64_t op1, uint64_t op2) +{ + return svwhilelo_b64_x2(op1, op2); +} + + +// WHILELS + +__attribute__((arm_streaming_compatible)) +// CHECK-LABEL: @test_svwhilels_b8_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilels.x2.nxv16i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP2]], [[TMP3]], i64 16) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z20test_svwhilels_b8_x2mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilels.x2.nxv16i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP2]], [[TMP3]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svboolx2_t test_svwhilels_b8_x2(uint64_t op1, uint64_t op2) +{ + return svwhilels_b8_x2(op1, op2); +} + +__attribute__((arm_streaming_compatible)) +// CHECK-LABEL: @test_svwhilels_b16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilels.x2.nxv8i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP4]]) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z21test_svwhilels_b16_x2mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilels.x2.nxv8i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP4]]) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svboolx2_t test_svwhilels_b16_x2(uint64_t op1, uint64_t op2) +{ + return svwhilels_b16_x2(op1, op2); +} + +__attribute__((arm_streaming_compatible)) +// CHECK-LABEL: @test_svwhilels_b32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilels.x2.nxv4i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( 
[[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP4]]) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z21test_svwhilels_b32_x2mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilels.x2.nxv4i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP4]]) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svboolx2_t test_svwhilels_b32_x2(uint64_t op1, uint64_t op2) +{ + return svwhilels_b32_x2(op1, op2); +} + +__attribute__((arm_streaming_compatible)) +// CHECK-LABEL: @test_svwhilels_b64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilels.x2.nxv2i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP4]]) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z21test_svwhilels_b64_x2mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilels.x2.nxv2i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP4]]) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svboolx2_t test_svwhilels_b64_x2(uint64_t op1, uint64_t op2) +{ + return svwhilels_b64_x2(op1, op2); +} + + +// WHILELT + +__attribute__((arm_streaming_compatible)) +// CHECK-LABEL: @test_svwhilelt_b8_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilelt.x2.nxv16i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP2]], [[TMP3]], i64 16) +// 
CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z20test_svwhilelt_b8_x2mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilelt.x2.nxv16i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP2]], [[TMP3]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svboolx2_t test_svwhilelt_b8_x2(uint64_t op1, uint64_t op2) +{ + return svwhilelt_b8_x2(op1, op2); +} + +__attribute__((arm_streaming_compatible)) +// CHECK-LABEL: @test_svwhilelt_b16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilelt.x2.nxv8i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP4]]) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z21test_svwhilelt_b16_x2mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilelt.x2.nxv8i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP4]]) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svboolx2_t test_svwhilelt_b16_x2(uint64_t op1, uint64_t op2) +{ + return svwhilelt_b16_x2(op1, op2); +} + +__attribute__((arm_streaming_compatible)) +// CHECK-LABEL: @test_svwhilelt_b32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilelt.x2.nxv4i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP4]]) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z21test_svwhilelt_b32_x2mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilelt.x2.nxv4i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP1]]) +// CPP-CHECK-NEXT: 
[[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP4]]) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svboolx2_t test_svwhilelt_b32_x2(uint64_t op1, uint64_t op2) +{ + return svwhilelt_b32_x2(op1, op2); +} + +__attribute__((arm_streaming_compatible)) +// CHECK-LABEL: @test_svwhilelt_b64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilelt.x2.nxv2i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP4]]) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z21test_svwhilelt_b64_x2mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilelt.x2.nxv2i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP4]]) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svboolx2_t test_svwhilelt_b64_x2(uint64_t op1, uint64_t op2) +{ + return svwhilelt_b64_x2(op1, op2); +} diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_zipx2.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_zipx2.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_zipx2.c @@ -0,0 +1,577 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu 
-target-feature +sve -target-feature +sme2 -target-feature -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 +#endif + +// 8-bit ZIPs + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svzip_s8_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zip.x2.nxv16i8( [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z16test_svzip_s8_x2u10__SVInt8_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zip.x2.nxv16i8( [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svint8x2_t test_svzip_s8_x2(svint8_t zn, svint8_t zm) { + return SVE_ACLE_FUNC(svzip,_s8,_x2,)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svzip_u8_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zip.x2.nxv16i8( [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z16test_svzip_u8_x2u11__SVUint8_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zip.x2.nxv16i8( [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint8x2_t test_svzip_u8_x2(svuint8_t zn, svuint8_t zm) { + return SVE_ACLE_FUNC(svzip,_u8,_x2,)(zn, zm); +} + +// 16-bit ZIPs + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svzip_s16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zip.x2.nxv8i16( [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: 
@_Z17test_svzip_s16_x2u11__SVInt16_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zip.x2.nxv8i16( [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svint16x2_t test_svzip_s16_x2(svint16_t zn, svint16_t zm) { + return SVE_ACLE_FUNC(svzip,_s16,_x2,)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svzip_u16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zip.x2.nxv8i16( [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z17test_svzip_u16_x2u12__SVUint16_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zip.x2.nxv8i16( [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint16x2_t test_svzip_u16_x2(svuint16_t zn, svuint16_t zm) { + return SVE_ACLE_FUNC(svzip,_u16,_x2,)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svzip_f16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zip.x2.nxv8f16( [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z17test_svzip_f16_x2u13__SVFloat16_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zip.x2.nxv8f16( [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svfloat16x2_t test_svzip_f16_x2(svfloat16_t zn, svfloat16_t zm) { + return SVE_ACLE_FUNC(svzip,_f16,_x2,)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svzip_bf16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zip.x2.nxv8bf16( [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail 
call @llvm.vector.insert.nxv16bf16.nxv8bf16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z18test_svzip_bf16_x2u14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zip.x2.nxv8bf16( [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svbfloat16x2_t test_svzip_bf16_x2(svbfloat16_t zn, svbfloat16_t zm) { + return SVE_ACLE_FUNC(svzip,_bf16,_x2,)(zn, zm); +} + +// 32-bit ZIPs + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svzip_s32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zip.x2.nxv4i32( [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z17test_svzip_s32_x2u11__SVInt32_tu11__SVInt32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zip.x2.nxv4i32( [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svint32x2_t test_svzip_s32_x2(svint32_t zn, svint32_t zm) { + return SVE_ACLE_FUNC(svzip,_s32,_x2,)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svzip_u32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zip.x2.nxv4i32( [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z17test_svzip_u32_x2u12__SVUint32_tu12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zip.x2.nxv4i32( [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint32x2_t test_svzip_u32_x2(svuint32_t zn, svuint32_t zm) { + return 
SVE_ACLE_FUNC(svzip,_u32,_x2,)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svzip_f32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zip.x2.nxv4f32( [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z17test_svzip_f32_x2u13__SVFloat32_tu13__SVFloat32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zip.x2.nxv4f32( [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svfloat32x2_t test_svzip_f32_x2(svfloat32_t zn, svfloat32_t zm) { + return SVE_ACLE_FUNC(svzip,_f32,_x2,)(zn, zm); +} + +// 64-bit ZIPs + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svzip_s64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zip.x2.nxv2i64( [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z17test_svzip_s64_x2u11__SVInt64_tu11__SVInt64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zip.x2.nxv2i64( [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svint64x2_t test_svzip_s64_x2(svint64_t zn, svint64_t zm) { + return SVE_ACLE_FUNC(svzip,_s64,_x2,)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svzip_u64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zip.x2.nxv2i64( [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z17test_svzip_u64_x2u12__SVUint64_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zip.x2.nxv2i64( [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], 
i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint64x2_t test_svzip_u64_x2(svuint64_t zn, svuint64_t zm) { + return SVE_ACLE_FUNC(svzip,_u64,_x2,)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svzip_f64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zip.x2.nxv2f64( [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z17test_svzip_f64_x2u13__SVFloat64_tu13__SVFloat64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zip.x2.nxv2f64( [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svfloat64x2_t test_svzip_f64_x2(svfloat64_t zn, svfloat64_t zm) { + return SVE_ACLE_FUNC(svzip,_f64,_x2,)(zn, zm); +} + +// 128-bit ZIPs + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svzipq_s8_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zipq.x2.nxv16i8( [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z17test_svzipq_s8_x2u10__SVInt8_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zipq.x2.nxv16i8( [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svint8x2_t test_svzipq_s8_x2(svint8_t zn, svint8_t zm) { + return SVE_ACLE_FUNC(svzipq,_s8,_x2,)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svzipq_u8_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zipq.x2.nxv16i8( [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z17test_svzipq_u8_x2u11__SVUint8_tu11__SVUint8_t( +// 
CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zipq.x2.nxv16i8( [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint8x2_t test_svzipq_u8_x2(svuint8_t zn, svuint8_t zm) { + return SVE_ACLE_FUNC(svzipq,_u8,_x2,)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svzipq_s16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zipq.x2.nxv8i16( [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z18test_svzipq_s16_x2u11__SVInt16_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zipq.x2.nxv8i16( [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svint16x2_t test_svzipq_s16_x2(svint16_t zn, svint16_t zm) { + return SVE_ACLE_FUNC(svzipq,_s16,_x2,)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svzipq_u16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zipq.x2.nxv8i16( [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z18test_svzipq_u16_x2u12__SVUint16_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zipq.x2.nxv8i16( [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint16x2_t test_svzipq_u16_x2(svuint16_t zn, svuint16_t zm) { + return SVE_ACLE_FUNC(svzipq,_u16,_x2,)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svzipq_f16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zipq.x2.nxv8f16( [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, 
[[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z18test_svzipq_f16_x2u13__SVFloat16_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zipq.x2.nxv8f16( [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svfloat16x2_t test_svzipq_f16_x2(svfloat16_t zn, svfloat16_t zm) { + return SVE_ACLE_FUNC(svzipq,_f16,_x2,)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svzipq_bf16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zipq.x2.nxv8bf16( [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z19test_svzipq_bf16_x2u14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zipq.x2.nxv8bf16( [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svbfloat16x2_t test_svzipq_bf16_x2(svbfloat16_t zn, svbfloat16_t zm) { + return SVE_ACLE_FUNC(svzipq,_bf16,_x2,)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svzipq_s32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zipq.x2.nxv4i32( [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z18test_svzipq_s32_x2u11__SVInt32_tu11__SVInt32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zipq.x2.nxv4i32( [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svint32x2_t test_svzipq_s32_x2(svint32_t zn, svint32_t zm) { + return SVE_ACLE_FUNC(svzipq,_s32,_x2,)(zn, zm); +} + 
+__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svzipq_u32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zipq.x2.nxv4i32( [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z18test_svzipq_u32_x2u12__SVUint32_tu12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zipq.x2.nxv4i32( [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint32x2_t test_svzipq_u32_x2(svuint32_t zn, svuint32_t zm) { + return SVE_ACLE_FUNC(svzipq,_u32,_x2,)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svzipq_f32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zipq.x2.nxv4f32( [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z18test_svzipq_f32_x2u13__SVFloat32_tu13__SVFloat32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zipq.x2.nxv4f32( [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svfloat32x2_t test_svzipq_f32_x2(svfloat32_t zn, svfloat32_t zm) { + return SVE_ACLE_FUNC(svzipq,_f32,_x2,)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svzipq_s64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zipq.x2.nxv2i64( [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z18test_svzipq_s64_x2u11__SVInt64_tu11__SVInt64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zipq.x2.nxv2i64( [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = 
extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svint64x2_t test_svzipq_s64_x2(svint64_t zn, svint64_t zm) { + return SVE_ACLE_FUNC(svzipq,_s64,_x2,)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svzipq_u64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zipq.x2.nxv2i64( [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z18test_svzipq_u64_x2u12__SVUint64_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zipq.x2.nxv2i64( [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint64x2_t test_svzipq_u64_x2(svuint64_t zn, svuint64_t zm) { + return SVE_ACLE_FUNC(svzipq,_u64,_x2,)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svzipq_f64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zipq.x2.nxv2f64( [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z18test_svzipq_f64_x2u13__SVFloat64_tu13__SVFloat64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zipq.x2.nxv2f64( [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svfloat64x2_t test_svzipq_f64_x2(svfloat64_t zn, svfloat64_t zm) { + return SVE_ACLE_FUNC(svzipq,_f64,_x2,)(zn, zm); +} + diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_zipx4.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_zipx4.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_zipx4.c @@ -0,0 +1,768 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror 
-Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -target-feature -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED,A5) A1##A3##A5 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4,A5) A1##A2##A3##A4##A5 +#endif + +// 8-bit ZIPs + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svzip_s8_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zip.x4.nxv16i8( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z16test_svzip_s8_x4u10__SVInt8_tu10__SVInt8_tu10__SVInt8_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zip.x4.nxv16i8( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint8x4_t test_svzip_s8_x4(svint8_t zn1, svint8_t zn2, svint8_t zn3, svint8_t zn4) { + return SVE_ACLE_FUNC(svzip,_s8,,,_x4)(zn1, zn2, zn3, zn4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svzip_u8_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zip.x4.nxv16i8( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( 
poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z16test_svzip_u8_x4u11__SVUint8_tu11__SVUint8_tu11__SVUint8_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zip.x4.nxv16i8( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint8x4_t test_svzip_u8_x4(svuint8_t zn1, svuint8_t zn2, svuint8_t zn3, svuint8_t zn4) { + return SVE_ACLE_FUNC(svzip,_u8,,,_x4)(zn1, zn2, zn3, zn4); +} + +// 16-bit ZIPs + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svzip_s16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zip.x4.nxv8i16( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svzip_s16_x4u11__SVInt16_tu11__SVInt16_tu11__SVInt16_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zip.x4.nxv8i16( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: 
[[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint16x4_t test_svzip_s16_x4(svint16_t zn1, svint16_t zn2, svint16_t zn3, svint16_t zn4) { + return SVE_ACLE_FUNC(svzip,_s16,,,_x4)(zn1, zn2, zn3, zn4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svzip_u16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zip.x4.nxv8i16( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svzip_u16_x4u12__SVUint16_tu12__SVUint16_tu12__SVUint16_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zip.x4.nxv8i16( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint16x4_t test_svzip_u16_x4(svuint16_t zn1, svuint16_t zn2, svuint16_t zn3, svuint16_t zn4) { + return SVE_ACLE_FUNC(svzip,_u16,,,_x4)(zn1, zn2, zn3, zn4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svzip_f16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zip.x4.nxv8f16( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP4]], [[TMP5]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 24) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: 
@_Z17test_svzip_f16_x4u13__SVFloat16_tu13__SVFloat16_tu13__SVFloat16_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zip.x4.nxv8f16( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP4]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat16x4_t test_svzip_f16_x4(svfloat16_t zn1, svfloat16_t zn2, svfloat16_t zn3, svfloat16_t zn4) { + return SVE_ACLE_FUNC(svzip,_f16,,,_x4)(zn1, zn2, zn3, zn4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svzip_bf16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zip.x4.nxv8bf16( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP4]], [[TMP5]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP6]], [[TMP7]], i64 24) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z18test_svzip_bf16_x4u14__SVBFloat16_tu14__SVBFloat16_tu14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zip.x4.nxv8bf16( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP4]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP6]], [[TMP7]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svbfloat16x4_t test_svzip_bf16_x4(svbfloat16_t zn1, svbfloat16_t zn2, svbfloat16_t zn3, svbfloat16_t zn4) { + return SVE_ACLE_FUNC(svzip,_bf16,,,_x4)(zn1, zn2, zn3, zn4); +} + +// 32-bit ZIPs + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svzip_s32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } 
@llvm.aarch64.sve.zip.x4.nxv4i32( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svzip_s32_x4u11__SVInt32_tu11__SVInt32_tu11__SVInt32_tu11__SVInt32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zip.x4.nxv4i32( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint32x4_t test_svzip_s32_x4(svint32_t zn1, svint32_t zn2, svint32_t zn3, svint32_t zn4) { + return SVE_ACLE_FUNC(svzip,_s32,,,_x4)(zn1, zn2, zn3, zn4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svzip_u32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zip.x4.nxv4i32( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svzip_u32_x4u12__SVUint32_tu12__SVUint32_tu12__SVUint32_tu12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zip.x4.nxv4i32( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], 
[[TMP3]], i64 4) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint32x4_t test_svzip_u32_x4(svuint32_t zn1, svuint32_t zn2, svuint32_t zn3, svuint32_t zn4) { + return SVE_ACLE_FUNC(svzip,_u32,,,_x4)(zn1, zn2, zn3, zn4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svzip_f32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zip.x4.nxv4f32( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svzip_f32_x4u13__SVFloat32_tu13__SVFloat32_tu13__SVFloat32_tu13__SVFloat32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zip.x4.nxv4f32( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat32x4_t test_svzip_f32_x4(svfloat32_t zn1, svfloat32_t zn2, svfloat32_t zn3, svfloat32_t zn4) { + return SVE_ACLE_FUNC(svzip,_f32,,,_x4)(zn1, zn2, zn3, zn4); +} + +// 64-bit ZIPs + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svzip_s64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zip.x4.nxv2i64( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 
3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svzip_s64_x4u11__SVInt64_tu11__SVInt64_tu11__SVInt64_tu11__SVInt64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zip.x4.nxv2i64( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint64x4_t test_svzip_s64_x4(svint64_t zn1, svint64_t zn2, svint64_t zn3, svint64_t zn4) { + return SVE_ACLE_FUNC(svzip,_s64,,,_x4)(zn1, zn2, zn3, zn4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svzip_u64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zip.x4.nxv2i64( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svzip_u64_x4u12__SVUint64_tu12__SVUint64_tu12__SVUint64_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zip.x4.nxv2i64( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint64x4_t test_svzip_u64_x4(svuint64_t zn1, svuint64_t zn2, svuint64_t zn3, svuint64_t zn4) { + return SVE_ACLE_FUNC(svzip,_u64,,,_x4)(zn1, zn2, zn3, zn4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svzip_f64_x4( +// CHECK-NEXT: entry: +// 
CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zip.x4.nxv2f64( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 6) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svzip_f64_x4u13__SVFloat64_tu13__SVFloat64_tu13__SVFloat64_tu13__SVFloat64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zip.x4.nxv2f64( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat64x4_t test_svzip_f64_x4(svfloat64_t zn1, svfloat64_t zn2, svfloat64_t zn3, svfloat64_t zn4) { + return SVE_ACLE_FUNC(svzip,_f64,,,_x4)(zn1, zn2, zn3, zn4); +} + +// 128-bit ZIPs + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svzipq_s8_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zipq.x4.nxv16i8( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svzipq_s8_x4u10__SVInt8_tu10__SVInt8_tu10__SVInt8_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zipq.x4.nxv16i8( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: 
[[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint8x4_t test_svzipq_s8_x4(svint8_t zn1, svint8_t zn2, svint8_t zn3, svint8_t zn4) { + return SVE_ACLE_FUNC(svzipq,_s8,,,_x4)(zn1, zn2, zn3, zn4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svzipq_u8_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zipq.x4.nxv16i8( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svzipq_u8_x4u11__SVUint8_tu11__SVUint8_tu11__SVUint8_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zipq.x4.nxv16i8( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint8x4_t test_svzipq_u8_x4(svuint8_t zn1, svuint8_t zn2, svuint8_t zn3, svuint8_t zn4) { + return SVE_ACLE_FUNC(svzipq,_u8,,,_x4)(zn1, zn2, zn3, zn4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svzipq_s16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zipq.x4.nxv8i16( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = 
extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z18test_svzipq_s16_x4u11__SVInt16_tu11__SVInt16_tu11__SVInt16_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zipq.x4.nxv8i16( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint16x4_t test_svzipq_s16_x4(svint16_t zn1, svint16_t zn2, svint16_t zn3, svint16_t zn4) { + return SVE_ACLE_FUNC(svzipq,_s16,,,_x4)(zn1, zn2, zn3, zn4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svzipq_u16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zipq.x4.nxv8i16( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z18test_svzipq_u16_x4u12__SVUint16_tu12__SVUint16_tu12__SVUint16_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zipq.x4.nxv8i16( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint16x4_t test_svzipq_u16_x4(svuint16_t zn1, svuint16_t zn2, svuint16_t zn3, svuint16_t zn4) { + return SVE_ACLE_FUNC(svzipq,_u16,,,_x4)(zn1, zn2, zn3, zn4); +} + +__attribute__((arm_streaming)) +// 
CHECK-LABEL: @test_svzipq_f16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zipq.x4.nxv8f16( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP4]], [[TMP5]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 24) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z18test_svzipq_f16_x4u13__SVFloat16_tu13__SVFloat16_tu13__SVFloat16_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zipq.x4.nxv8f16( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP4]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat16x4_t test_svzipq_f16_x4(svfloat16_t zn1, svfloat16_t zn2, svfloat16_t zn3, svfloat16_t zn4) { + return SVE_ACLE_FUNC(svzipq,_f16,,,_x4)(zn1, zn2, zn3, zn4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svzipq_bf16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zipq.x4.nxv8bf16( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP4]], [[TMP5]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP6]], [[TMP7]], i64 24) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z19test_svzipq_bf16_x4u14__SVBFloat16_tu14__SVBFloat16_tu14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zipq.x4.nxv8bf16( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( poison, [[TMP1]], i64 0) +// 
CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP4]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP6]], [[TMP7]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svbfloat16x4_t test_svzipq_bf16_x4(svbfloat16_t zn1, svbfloat16_t zn2, svbfloat16_t zn3, svbfloat16_t zn4) { + return SVE_ACLE_FUNC(svzipq,_bf16,,,_x4)(zn1, zn2, zn3, zn4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svzipq_s32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zipq.x4.nxv4i32( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z18test_svzipq_s32_x4u11__SVInt32_tu11__SVInt32_tu11__SVInt32_tu11__SVInt32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zipq.x4.nxv4i32( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint32x4_t test_svzipq_s32_x4(svint32_t zn1, svint32_t zn2, svint32_t zn3, svint32_t zn4) { + return SVE_ACLE_FUNC(svzipq,_s32,,,_x4)(zn1, zn2, zn3, zn4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svzipq_u32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zipq.x4.nxv4i32( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: 
[[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z18test_svzipq_u32_x4u12__SVUint32_tu12__SVUint32_tu12__SVUint32_tu12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zipq.x4.nxv4i32( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint32x4_t test_svzipq_u32_x4(svuint32_t zn1, svuint32_t zn2, svuint32_t zn3, svuint32_t zn4) { + return SVE_ACLE_FUNC(svzipq,_u32,,,_x4)(zn1, zn2, zn3, zn4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svzipq_f32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zipq.x4.nxv4f32( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z18test_svzipq_f32_x4u13__SVFloat32_tu13__SVFloat32_tu13__SVFloat32_tu13__SVFloat32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zipq.x4.nxv4f32( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat32x4_t test_svzipq_f32_x4(svfloat32_t zn1, svfloat32_t zn2, 
svfloat32_t zn3, svfloat32_t zn4) { + return SVE_ACLE_FUNC(svzipq,_f32,,,_x4)(zn1, zn2, zn3, zn4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svzipq_s64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zipq.x4.nxv2i64( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z18test_svzipq_s64_x4u11__SVInt64_tu11__SVInt64_tu11__SVInt64_tu11__SVInt64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zipq.x4.nxv2i64( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint64x4_t test_svzipq_s64_x4(svint64_t zn1, svint64_t zn2, svint64_t zn3, svint64_t zn4) { + return SVE_ACLE_FUNC(svzipq,_s64,,,_x4)(zn1, zn2, zn3, zn4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svzipq_u64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zipq.x4.nxv2i64( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z18test_svzipq_u64_x4u12__SVUint64_tu12__SVUint64_tu12__SVUint64_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zipq.x4.nxv2i64( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: 
[[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint64x4_t test_svzipq_u64_x4(svuint64_t zn1, svuint64_t zn2, svuint64_t zn3, svuint64_t zn4) { + return SVE_ACLE_FUNC(svzipq,_u64,,,_x4)(zn1, zn2, zn3, zn4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svzipq_f64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zipq.x4.nxv2f64( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 6) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z18test_svzipq_f64_x4u13__SVFloat64_tu13__SVFloat64_tu13__SVFloat64_tu13__SVFloat64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zipq.x4.nxv2f64( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat64x4_t test_svzipq_f64_x4(svfloat64_t zn1, svfloat64_t zn2, svfloat64_t zn3, svfloat64_t zn4) { + return SVE_ACLE_FUNC(svzipq,_f64,,,_x4)(zn1, zn2, zn3, zn4); +} diff --git a/clang/test/Misc/pragma-attribute-supported-attributes-list.test b/clang/test/Misc/pragma-attribute-supported-attributes-list.test --- a/clang/test/Misc/pragma-attribute-supported-attributes-list.test +++ b/clang/test/Misc/pragma-attribute-supported-attributes-list.test @@ -18,6 +18,8 @@ // CHECK-NEXT: AnyX86NoCfCheck (SubjectMatchRule_hasType_functionType) // CHECK-NEXT: ArcWeakrefUnavailable (SubjectMatchRule_objc_interface) // CHECK-NEXT: ArmBuiltinAlias (SubjectMatchRule_function) +// CHECK-NEXT: ArmLocallyStreaming (SubjectMatchRule_function) +// 
CHECK-NEXT: ArmNewZA (SubjectMatchRule_function) // CHECK-NEXT: AssumeAligned (SubjectMatchRule_objc_method, SubjectMatchRule_function) // CHECK-NEXT: Assumption (SubjectMatchRule_function, SubjectMatchRule_objc_method) // CHECK-NEXT: Availability ((SubjectMatchRule_record, SubjectMatchRule_enum, SubjectMatchRule_enum_constant, SubjectMatchRule_field, SubjectMatchRule_function, SubjectMatchRule_namespace, SubjectMatchRule_objc_category, SubjectMatchRule_objc_implementation, SubjectMatchRule_objc_interface, SubjectMatchRule_objc_method, SubjectMatchRule_objc_property, SubjectMatchRule_objc_protocol, SubjectMatchRule_record, SubjectMatchRule_type_alias, SubjectMatchRule_variable)) diff --git a/clang/test/Preprocessor/aarch64-target-features.c b/clang/test/Preprocessor/aarch64-target-features.c --- a/clang/test/Preprocessor/aarch64-target-features.c +++ b/clang/test/Preprocessor/aarch64-target-features.c @@ -228,6 +228,10 @@ // RUN: %clang -target aarch64-none-linux-gnu -march=armv9-a+sve2-bitperm -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-SVE2BITPERM %s // CHECK-SVE2BITPERM: __ARM_FEATURE_SVE2_BITPERM 1 +// RUN: %clang -target aarch64-none-linux-gnu -march=armv8-a+sme -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-SME-BF16 %s +// CHECK-SME-BF16: __ARM_FEATURE_SME 1 +// CHECK-SME-BF16: __ARM_FEATURE_UNALIGNED 1 + // RUN: %clang -target aarch64-none-linux-gnu -march=armv8.2a+dotprod -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-DOTPROD %s // RUN: %clang -target aarch64-none-linux-gnu -march=armv8.4a -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-DOTPROD %s // CHECK-DOTPROD: __ARM_FEATURE_DOTPROD 1 diff --git a/clang/test/Sema/aarch64-incompat-sm-builtin-calls.c b/clang/test/Sema/aarch64-incompat-sm-builtin-calls.c new file mode 100644 --- /dev/null +++ b/clang/test/Sema/aarch64-incompat-sm-builtin-calls.c @@ -0,0 +1,125 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve \ +// RUN: -target-feature +sme -target-feature +sve2 -target-feature +neon -fsyntax-only -verify %s + +// REQUIRES: aarch64-registered-target + +#include "arm_neon.h" +#include "arm_sme.h" +#include "arm_sve.h" + +__attribute__((arm_streaming)) int16x8_t incompat_neon_sm(int16x8_t splat) { + // expected-warning@+1 {{builtin call has undefined behaviour when called from a streaming or locally streaming function}} + return (int16x8_t)__builtin_neon_vqaddq_v((int8x16_t)splat, (int8x16_t)splat, 33); +} + +__attribute__((arm_locally_streaming)) int16x8_t incompat_neon_ls(int16x8_t splat) { + // expected-warning@+1 {{builtin call has undefined behaviour when called from a streaming or locally streaming function}} + return (int16x8_t)__builtin_neon_vqaddq_v((int8x16_t)splat, (int8x16_t)splat, 33); +} + +__attribute__((arm_streaming_compatible)) int16x8_t incompat_neon_smc(int16x8_t splat) { + // expected-warning@+1 {{builtin call has undefined behaviour when called from a streaming compatible function}} + return (int16x8_t)__builtin_neon_vqaddq_v((int8x16_t)splat, (int8x16_t)splat, 33); +} + +__attribute__((arm_shared_za)) void incompat_sme_norm(svbool_t pg, void const *ptr) { + // expected-warning@+1 {{builtin call has undefined behaviour when called from a non-streaming function}} + return __builtin_sme_svld1_hor_za128(0, 0, 0, pg, ptr); +} + +__attribute__((arm_streaming_compatible, arm_shared_za)) void incompat_sme_smc(svbool_t pg, void const *ptr) { + // expected-warning@+1 {{builtin call has 
undefined behaviour when called from a streaming compatible function}} + return __builtin_sme_svld1_hor_za128(0, 0, 0, pg, ptr); +} + +__attribute__((arm_streaming)) +svuint32_t +incompat_sve_sm(svbool_t pg, svuint32_t a, int16_t b) { + // expected-warning@+1 {{builtin call has undefined behaviour when called from a streaming or locally streaming function}} + return __builtin_sve_svld1_gather_u32base_index_u32(pg, a, b); +} + +__attribute__((arm_locally_streaming)) +svuint32_t +incompat_sve_ls(svbool_t pg, svuint32_t a, int64_t b) { + // expected-warning@+1 {{builtin call has undefined behaviour when called from a streaming or locally streaming function}} + return __builtin_sve_svld1_gather_u32base_index_u32(pg, a, b); +} + +__attribute__((arm_streaming_compatible)) +svuint32_t +incompat_sve_smc(svbool_t pg, svuint32_t a, int64_t b) { + // expected-warning@+1 {{builtin call has undefined behaviour when called from a streaming compatible function}} + return __builtin_sve_svld1_gather_u32base_index_u32(pg, a, b); +} + +__attribute__((arm_streaming)) +svuint32_t +incompat_sve2_sm(svbool_t pg, svuint32_t a, int64_t b) { + // expected-warning@+1 {{builtin call has undefined behaviour when called from a streaming or locally streaming function}} + return __builtin_sve_svldnt1_gather_u32base_index_u32(pg, a, b); +} + +__attribute__((arm_locally_streaming)) +svuint32_t +incompat_sve2_ls(svbool_t pg, svuint32_t a, int64_t b) { + // expected-warning@+1 {{builtin call has undefined behaviour when called from a streaming or locally streaming function}} + return __builtin_sve_svldnt1_gather_u32base_index_u32(pg, a, b); +} + +__attribute__((arm_streaming_compatible)) +svuint32_t +incompat_sve2_smc(svbool_t pg, svuint32_t a, int64_t b) { + // expected-warning@+1 {{builtin call has undefined behaviour when called from a streaming compatible function}} + return __builtin_sve_svldnt1_gather_u32base_index_u32(pg, a, b); +} + +__attribute__((arm_shared_za)) +void +incompat_sme_sm(svbool_t pn, svbool_t pm, svfloat32_t zn, svfloat32_t zm) { + // expected-warning@+1 {{builtin call has undefined behaviour when called from a non-streaming function}} + svmops_za32_f32_m(0, pn, pm, zn, zm); +} + +__attribute__((arm_streaming)) svfloat64_t +streaming_caller_sve(svbool_t pg, svfloat64_t a, float64_t b) { + // expected-no-warning + return svadd_n_f64_m(pg, a, b); +} + +__attribute__((arm_locally_streaming)) svfloat64_t +locally_streaming_caller_sve(svbool_t pg, svfloat64_t a, float64_t b) { + // expected-no-warning + return svadd_n_f64_m(pg, a, b); +} + +__attribute__((arm_streaming_compatible)) svfloat64_t +streaming_compatible_caller_sve(svbool_t pg, svfloat64_t a, float64_t b) { + // expected-no-warning + return svadd_n_f64_m(pg, a, b); +} + +__attribute__((arm_streaming)) svint16_t +streaming_caller_sve2(svint16_t op1, svint16_t op2) { + // expected-no-warning + return svmul_lane_s16(op1, op2, 0); +} + +__attribute__((arm_locally_streaming)) svint16_t +locally_streaming_caller_sve2(svint16_t op1, svint16_t op2) { + // expected-no-warning + return svmul_lane_s16(op1, op2, 0); +} + +__attribute__((arm_streaming_compatible)) svint16_t +streaming_compatible_caller_sve2(svint16_t op1, svint16_t op2) { + // expected-no-warning + return svmul_lane_s16(op1, op2, 0); +} + +__attribute__((arm_streaming)) svbool_t +streaming_caller_ptrue(void) { + // expected-no-warning + return svand_z(svptrue_b16(), svptrue_pat_b16(SV_ALL), svptrue_pat_b16(SV_VL4)); +} diff --git a/clang/test/Sema/aarch64-incompat-za-builtin-calls.c 
b/clang/test/Sema/aarch64-incompat-za-builtin-calls.c new file mode 100644 --- /dev/null +++ b/clang/test/Sema/aarch64-incompat-za-builtin-calls.c @@ -0,0 +1,27 @@ +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme -fsyntax-only -verify %s + +// REQUIRES: aarch64-registered-target + +#include "arm_sme.h" + +void requires_za_state(void) { + // expected-warning@+1 {{builtin call is not valid when calling from a function without active ZA state}} + return svzero_mask_za(0); +} + +// No diagnostic expected +__attribute__((arm_new_za)) void with_new_za_state(void) { + return svzero_mask_za(0); +} + +// No diagnostic expected +__attribute__((arm_shared_za)) void with_shared_za_state(void) { + return svzero_mask_za(0); +} + +// FIXME: svzero_za() is defined in arm_sme.h, but is not a builtin. +// Clang should give a similar diagnostic for 'regular' calls to shared_za +// functions when the caller doesn't have ZA state. +__attribute__((arm_shared_za)) void requires_za_state_svzero_no_mask(void) { + return svzero_za(); +} diff --git a/clang/test/Sema/aarch64-sme-attrs-no-sme.c b/clang/test/Sema/aarch64-sme-attrs-no-sme.c new file mode 100644 --- /dev/null +++ b/clang/test/Sema/aarch64-sme-attrs-no-sme.c @@ -0,0 +1,36 @@ +// Test that the attribute is ignored if we don't compile for both AArch64 with +sme. +// +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -fsyntax-only -verify %s +// RUN: %clang_cc1 -triple x86_64-none-linux-gnu -target-feature +sme -fsyntax-only -verify %s + +extern int normal_callee(void); + +// expected-warning@+1 {{unknown attribute 'arm_streaming' ignored}} +__attribute__((arm_streaming)) +int streaming_caller(void) { + return normal_callee(); +} + +// expected-warning@+1 {{unknown attribute 'arm_locally_streaming' ignored}} +__attribute__((arm_locally_streaming)) +int locally_streaming_caller(void) { + return normal_callee(); +} + +// expected-warning@+1 {{unknown attribute 'arm_shared_za' ignored}} +__attribute__((arm_shared_za)) +int shared_za_caller(void) { + return normal_callee(); +} + +// expected-warning@+1 {{unknown attribute 'arm_preserves_za' ignored}} +__attribute__((arm_preserves_za)) +int preserves_za_caller(void) { + return normal_callee(); +} + +// expected-warning@+1 {{unknown attribute 'arm_new_za' ignored}} +__attribute__((arm_new_za)) +int new_za_caller(void) { + return normal_callee(); +} diff --git a/clang/test/Sema/aarch64-sme-attrs-on-x86.c b/clang/test/Sema/aarch64-sme-attrs-on-x86.c new file mode 100644 --- /dev/null +++ b/clang/test/Sema/aarch64-sme-attrs-on-x86.c @@ -0,0 +1,41 @@ +// RUN: %clang_cc1 -triple x86_64-none-linux-gnu -target-feature +sme -fsyntax-only -verify %s + +// REQUIRES: aarch64-registered-target + +extern int normal_callee(); + +// expected-warning@+1 {{unknown attribute 'arm_streaming' ignored}} +__attribute__((arm_streaming)) +int streaming_caller(void) { + return normal_callee(); +} + +// expected-warning@+1 {{unknown attribute 'arm_streaming_compatible' ignored}} +__attribute__((arm_streaming_compatible)) +int streaming_compatible_caller(void) { + return normal_callee(); +} + +// expected-warning@+1 {{unknown attribute 'arm_locally_streaming' ignored}} +__attribute__((arm_locally_streaming)) +int locally_streaming_caller(void) { + return normal_callee(); +} + +// expected-warning@+1 {{unknown attribute 'arm_shared_za' ignored}} +__attribute__((arm_shared_za)) +int shared_za_caller(void) { + return normal_callee(); +} + +// expected-warning@+1 {{unknown attribute 'arm_preserves_za' 
ignored}} +__attribute__((arm_preserves_za)) +int preserves_za_caller(void) { + return normal_callee(); +} + +// expected-warning@+1 {{unknown attribute 'arm_new_za' ignored}} +__attribute__((arm_new_za)) +int new_za_caller(void) { + return normal_callee(); +} diff --git a/clang/test/Sema/aarch64-sme-func-attrs.c b/clang/test/Sema/aarch64-sme-func-attrs.c new file mode 100644 --- /dev/null +++ b/clang/test/Sema/aarch64-sme-func-attrs.c @@ -0,0 +1,233 @@ +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -fsyntax-only -verify %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -fsyntax-only -verify=expected-cpp -x c++ %s + +// Valid attributes + +__attribute__((arm_streaming)) void sme_arm_streaming(void); +__attribute__((arm_streaming_compatible)) void sme_arm_streaming_compatible(void); + +__attribute__((arm_new_za)) void sme_arm_new_za(void) {} +__attribute__((arm_shared_za)) void sme_arm_shared_za(void); +__attribute__((arm_preserves_za)) void sme_arm_preserves_za(void); + +__attribute__((arm_streaming, arm_new_za)) void sme_arm_streaming_new_za(void) {} +__attribute__((arm_streaming, arm_shared_za)) void sme_arm_streaming_shared_za(void); +__attribute__((arm_streaming, arm_preserves_za)) void sme_arm_streaming_preserves_za(void); + +__attribute__((arm_streaming_compatible, arm_new_za)) void sme_arm_sc_new_za(void) {} +__attribute__((arm_streaming_compatible, arm_shared_za)) void sme_arm_sc_shared_za(void); +__attribute__((arm_streaming_compatible, arm_preserves_za)) void sme_arm_sc_preserves_za(void); + +__attribute__((arm_shared_za, arm_preserves_za)) void sme_arm_shared_preserves_za(void); + +__attribute__((arm_locally_streaming)) void sme_arm_locally_streaming(void) { } +__attribute__((arm_locally_streaming, arm_streaming)) void sme_arm_streaming_and_locally_streaming(void) { } +__attribute__((arm_locally_streaming, arm_streaming_compatible)) void sme_arm_streaming_and_streaming_compatible(void) { } + +__attribute__((arm_locally_streaming, arm_new_za)) void sme_arm_ls_new_za(void) { } +__attribute__((arm_locally_streaming, arm_shared_za)) void sme_arm_ls_shared_za(void) { } +__attribute__((arm_locally_streaming, arm_preserves_za)) void sme_arm_ls_preserves_za(void) { } + +// Valid attributes on function pointers + +void __attribute__((arm_streaming)) streaming_ptr(void); +typedef __attribute__((arm_streaming)) void (*fptrty1) (void); +fptrty1 call_streaming_func() { return streaming_ptr; } + +void __attribute__((arm_streaming_compatible)) streaming_compatible_ptr(void); +typedef __attribute__((arm_streaming_compatible)) void (*fptrty2) (void); +fptrty2 call_sc_func() { return streaming_compatible_ptr; } + +void __attribute__((arm_shared_za)) shared_za_ptr(void); +typedef __attribute__((arm_shared_za)) void (*fptrty3) (void); +fptrty3 call_shared_za_func() { return shared_za_ptr; } + +void __attribute__((arm_preserves_za)) preserves_za_ptr(void); +typedef __attribute__((arm_preserves_za)) void (*fptrty4) (void); +fptrty4 call_preserve_za_func() { return preserves_za_ptr; } + +void __attribute__((arm_shared_za, arm_preserves_za)) shared_preserves_za_ptr(void); +typedef __attribute__((arm_shared_za, arm_preserves_za)) void (*fptrty5) (void); +fptrty5 call_shared_preserve_za_func() { return shared_preserves_za_ptr; } + +typedef void (*fptrty6) (void); +fptrty6 cast_nza_func_to_normal() { return sme_arm_new_za; } +fptrty6 cast_ls_func_to_normal() { return sme_arm_locally_streaming; } + +// FIXME: Add invalid function pointer assignments such as 
assigning: +// 1. A streaming compatible function to a normal function pointer, +// 2. A locally streaming function to a streaming function pointer, +// etc. + +// Invalid attributes + +// expected-cpp-error@+4 {{'arm_streaming_compatible' and 'arm_streaming' attributes are not compatible}} +// expected-cpp-note@+3 {{conflicting attribute is here}} +// expected-error@+2 {{'arm_streaming_compatible' and 'arm_streaming' attributes are not compatible}} +// expected-note@+1 {{conflicting attribute is here}} +__attribute__((arm_streaming, arm_streaming_compatible)) void streaming_mode(void); + +// expected-cpp-error@+4 {{'arm_streaming' and 'arm_streaming_compatible' attributes are not compatible}} +// expected-cpp-note@+3 {{conflicting attribute is here}} +// expected-error@+2 {{'arm_streaming' and 'arm_streaming_compatible' attributes are not compatible}} +// expected-note@+1 {{conflicting attribute is here}} +__attribute__((arm_streaming_compatible, arm_streaming)) void streaming_compatible(void); + +// expected-cpp-error@+2 {{'arm_new_za' and 'arm_shared_za' attributes are not compatible}} +// expected-error@+1 {{'arm_new_za' and 'arm_shared_za' attributes are not compatible}} +__attribute__((arm_new_za, arm_shared_za)) void new_shared_za(void) {} + +// expected-cpp-error@+2 {{'arm_new_za' and 'arm_shared_za' attributes are not compatible}} +// expected-error@+1 {{'arm_new_za' and 'arm_shared_za' attributes are not compatible}} +__attribute__((arm_shared_za, arm_new_za)) void shared_new_za(void) {} + +// expected-cpp-error@+2 {{'arm_new_za' and 'arm_preserves_za' attributes are not compatible}} +// expected-error@+1 {{'arm_new_za' and 'arm_preserves_za' attributes are not compatible}} +__attribute__((arm_new_za, arm_preserves_za)) void new_preserves_za(void) {} + +// expected-cpp-error@+2 {{'arm_new_za' and 'arm_preserves_za' attributes are not compatible}} +// expected-error@+1 {{'arm_new_za' and 'arm_preserves_za' attributes are not compatible}} +__attribute__((arm_preserves_za, arm_new_za)) void preserves_new_za(void) {} + +// Invalid attributes on function pointers + +// expected-cpp-error@+4 {{'arm_streaming_compatible' and 'arm_streaming' attributes are not compatible}} +// expected-cpp-note@+3 {{conflicting attribute is here}} +// expected-error@+2 {{'arm_streaming_compatible' and 'arm_streaming' attributes are not compatible}} +// expected-note@+1 {{conflicting attribute is here}} +void __attribute__((arm_streaming, arm_streaming_compatible)) streaming_ptr_invalid(void); +// expected-cpp-error@+4 {{'arm_streaming_compatible' and 'arm_streaming' attributes are not compatible}} +// expected-cpp-note@+3 {{conflicting attribute is here}} +// expected-error@+2 {{'arm_streaming_compatible' and 'arm_streaming' attributes are not compatible}} +// expected-note@+1 {{conflicting attribute is here}} +typedef __attribute__((arm_streaming, arm_streaming_compatible)) void (*fptrty7) (void); +fptrty7 invalid_streaming_func() { return streaming_ptr_invalid; } + +// expected-cpp-error@+2 {{'arm_locally_streaming' attribute only applies to functions}} +// expected-error@+1 {{'arm_locally_streaming' attribute only applies to functions}} +typedef __attribute__((arm_locally_streaming)) void (*fptrty8) (void); + +// expected-warning@+2 {{'arm_streaming' attribute ignored}} +// expected-warning@+1 {{'arm_streaming' only applies to function types; type here is 'void ()'}} +__attribute__((arm_streaming)) void function_no_prototype(); + +// +// Check for incorrect conversions of function pointers with the 
attributes +// + +typedef void (*n_ptrty) (void); +typedef __attribute__((arm_streaming)) void (*s_ptrty) (void); +s_ptrty return_valid_streaming_fptr(s_ptrty f) { return f; } + +// expected-cpp-error@+2 {{cannot initialize return object of type 's_ptrty' (aka 'void (*)() __attribute__((arm_streaming))') with an lvalue of type 'n_ptrty' (aka 'void (*)()')}} +// expected-error@+1 {{incompatible function pointer types returning 'n_ptrty' (aka 'void (*)(void)') from a function with result type 's_ptrty' (aka 'void (*)(void) __attribute__((arm_streaming))')}} +s_ptrty return_invalid_fptr_streaming_normal(n_ptrty f) { return f; } +// expected-cpp-error@+2 {{cannot initialize return object of type 'n_ptrty' (aka 'void (*)()') with an lvalue of type 's_ptrty' (aka 'void (*)() __attribute__((arm_streaming))')}} +// expected-error@+1 {{incompatible function pointer types returning 's_ptrty' (aka 'void (*)(void) __attribute__((arm_streaming))') from a function with result type 'n_ptrty' (aka 'void (*)(void)')}} +n_ptrty return_invalid_fptr_normal_streaming(s_ptrty f) { return f; } + +// Test an instance where the result type is not a prototyped function, such that we still get a diagnostic. +typedef void (*nonproto_n_ptrty) (); +// expected-cpp-error@+2 {{cannot initialize return object of type 'nonproto_n_ptrty' (aka 'void (*)()') with an lvalue of type 's_ptrty' (aka 'void (*)() __attribute__((arm_streaming))')}} +// expected-error@+1 {{incompatible function pointer types returning 's_ptrty' (aka 'void (*)(void) __attribute__((arm_streaming))') from a function with result type 'nonproto_n_ptrty' (aka 'void (*)()')}} +nonproto_n_ptrty return_invalid_fptr_streaming_nonprotonormal(s_ptrty f) { return f; } + +typedef __attribute__((arm_streaming_compatible)) void (*sc_ptrty) (void); +sc_ptrty return_valid_streaming_compatible_fptr(sc_ptrty f) { return f; } + +// expected-cpp-error@+2 {{cannot initialize return object of type 'sc_ptrty' (aka 'void (*)() __attribute__((arm_streaming_compatible))') with an lvalue of type 'n_ptrty' (aka 'void (*)()')}} +// expected-error@+1 {{incompatible function pointer types returning 'n_ptrty' (aka 'void (*)(void)') from a function with result type 'sc_ptrty' (aka 'void (*)(void) __attribute__((arm_streaming_compatible))')}} +sc_ptrty return_invalid_fptr_streaming_compatible_normal(n_ptrty f) { return f; } +// expected-cpp-error@+2 {{cannot initialize return object of type 'n_ptrty' (aka 'void (*)()') with an lvalue of type 'sc_ptrty' (aka 'void (*)() __attribute__((arm_streaming_compatible))')}} +// expected-error@+1 {{incompatible function pointer types returning 'sc_ptrty' (aka 'void (*)(void) __attribute__((arm_streaming_compatible))') from a function with result type 'n_ptrty' (aka 'void (*)(void)')}} +n_ptrty return_invalid_fptr_normal_streaming_compatible(sc_ptrty f) { return f; } + +typedef __attribute__((arm_shared_za)) void (*sz_ptrty) (void); +sz_ptrty return_valid_shared_za_fptr(sz_ptrty f) { return f; } + + +// expected-cpp-error@+2 {{cannot initialize return object of type 'sz_ptrty' (aka 'void (*)() __attribute__((arm_shared_za))') with an lvalue of type 'n_ptrty' (aka 'void (*)()')}} +// expected-error@+1 {{incompatible function pointer types returning 'n_ptrty' (aka 'void (*)(void)') from a function with result type 'sz_ptrty' (aka 'void (*)(void) __attribute__((arm_shared_za))')}} +sz_ptrty return_invalid_fptr_shared_za_normal(n_ptrty f) { return f; } +// expected-cpp-error@+2 {{cannot initialize return object of type 'n_ptrty' (aka 'void (*)()') 
with an lvalue of type 'sz_ptrty' (aka 'void (*)() __attribute__((arm_shared_za))')}}
+// expected-error@+1 {{incompatible function pointer types returning 'sz_ptrty' (aka 'void (*)(void) __attribute__((arm_shared_za))') from a function with result type 'n_ptrty' (aka 'void (*)(void)')}}
+n_ptrty return_invalid_fptr_normal_shared_za(sz_ptrty f) { return f; }
+
+typedef __attribute__((arm_preserves_za)) void (*pz_ptrty) (void);
+pz_ptrty return_valid_preserves_za_fptr(pz_ptrty f) { return f; }
+
+// expected-cpp-error@+2 {{cannot initialize return object of type 'pz_ptrty' (aka 'void (*)() __attribute__((arm_preserves_za))') with an lvalue of type 'n_ptrty' (aka 'void (*)()')}}
+// expected-error@+1 {{incompatible function pointer types returning 'n_ptrty' (aka 'void (*)(void)') from a function with result type 'pz_ptrty' (aka 'void (*)(void) __attribute__((arm_preserves_za))')}}
+pz_ptrty return_invalid_fptr_preserves_za_normal(n_ptrty f) { return f; }
+// No diagnostics, the preserves_za hint should be dropped silently.
+n_ptrty return_invalid_fptr_normal_preserves_za(pz_ptrty f) { return f; }
+
+// Test template instantiations
+#ifdef __cplusplus
+template <typename T> __attribute__((arm_streaming)) T templated(T x) { return x; }
+template <> __attribute__((arm_streaming)) int templated(int x) { return x + 1; }
+template <> __attribute__((arm_streaming)) float templated(float x) { return x + 2; }
+// expected-cpp-error@+2 {{explicit instantiation of 'templated' does not refer to a function template, variable template, member function, member class, or static data member}}
+// expected-cpp-note@-4 {{candidate template ignored: could not match 'short (short) __attribute__((arm_streaming))' against 'short (short)'}}
+template short templated(short);
+#endif
+
+// Conflicting attributes on redeclarations
+
+// expected-error@+5 {{function declared ''void (void) __attribute__((arm_streaming_compatible))'' was previously declared ''void (void) __attribute__((arm_streaming))'' with different SME function attributes}}
+// expected-note@+3 {{previous declaration is here}}
+// expected-cpp-error@+3 {{function declared ''void () __attribute__((arm_streaming_compatible))'' was previously declared ''void () __attribute__((arm_streaming))'' with different SME function attributes}}
+// expected-cpp-note@+1 {{previous declaration is here}}
+__attribute__((arm_streaming)) void redecl(void);
+__attribute__((arm_streaming_compatible)) void redecl(void) { }
+
+// expected-error@+5 {{function declared ''void (void) __attribute__((arm_shared_za))'' was previously declared ''void (void) __attribute__((arm_shared_za)) __attribute__((arm_preserves_za))'' with different SME function attributes}}
+// expected-note@+3 {{previous declaration is here}}
+// expected-cpp-error@+3 {{function declared ''void () __attribute__((arm_shared_za))'' was previously declared ''void () __attribute__((arm_shared_za)) __attribute__((arm_preserves_za))'' with different SME function attributes}}
+// expected-cpp-note@+1 {{previous declaration is here}}
+__attribute__((arm_shared_za, arm_preserves_za)) void redecl_nopreserve_za(void);
+__attribute__((arm_shared_za)) void redecl_nopreserve_za(void) { }
+
+#ifdef __cplusplus
+struct S {
+  virtual __attribute__((arm_shared_za)) void shared_za_memberfn(void);
+};
+
+struct S2 : public S {
+// expected-cpp-error@+2 {{virtual function 'shared_za_memberfn' has different attributes ('void ()') than the function it overrides (which has 'void () __attribute__((arm_shared_za))')}}
+// expected-cpp-note@-5 {{overridden virtual function is here}}
+  __attribute__((arm_new_za)) void shared_za_memberfn(void) override {}
+};
+
+// Check that the attribute propagates through template instantiations.
+template <typename T>
+struct S3 {
+  static constexpr int value = 0;
+};
+
+template <>
+struct S3<void (*)()> {
+  static constexpr int value = 1;
+};
+
+template <>
+struct S3<void (*)() __attribute__((arm_streaming))> {
+  static constexpr int value = 2;
+};
+
+void normal_func() {}
+void streaming_func() __attribute__((arm_streaming)) {}
+
+static_assert(S3<decltype(&normal_func)>::value == 1, "why are we picking the wrong specialization?");
+static_assert(S3<decltype(&streaming_func)>::value == 2, "why are we picking the wrong specialization?");
+#endif
+
+// expected-cpp-error@+2 {{'arm_streaming' attribute takes no arguments}}
+// expected-error@+1 {{'arm_streaming' attribute takes no arguments}}
+__attribute__((arm_streaming(0))) void invalid_streaming_args(void);
+
+// expected-cpp-error@+4 {{attribute only applies to non-K&R-style functions}}
+// expected-cpp-warning@+3 {{'arm_streaming' only applies to function types; type here is 'int'}}
+// expected-error@+2 {{attribute only applies to non-K&R-style functions}}
+// expected-warning@+1 {{'arm_streaming' only applies to function types; type here is 'int'}}
+__attribute__((arm_streaming)) int invalid_type_for_attribute;
diff --git a/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_imm.cpp b/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_imm.cpp
new file mode 100644
--- /dev/null
+++ b/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_imm.cpp
@@ -0,0 +1,559 @@
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu \
+// RUN: -target-feature +sve2 -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -fsyntax-only -verify %s
+
+// REQUIRES: aarch64-registered-target
+
+#include
+
+__attribute__((arm_streaming, arm_shared_za))
+void test_multivector_read(uint32_t base) {
+
+  // Test Tile Range
+  svread_hor_za8_u8_vg2(1, base, 0); // expected-error {{argument value 1 is outside the valid range [0, 0]}}
+  svread_ver_za8_u8_vg2(1, base, 0); // expected-error {{argument value 1 is outside the valid range [0, 0]}}
+  svread_hor_za8_u8_vg4(1, base, 0); // expected-error {{argument value 1 is outside the valid range [0, 0]}}
+  svread_ver_za8_u8_vg4(1, base, 0); // expected-error {{argument value 1 is outside the valid range [0, 0]}}
+
+  svread_hor_za16_u16_vg2(2, base, 0); // expected-error {{argument value 2 is outside the valid range [0, 1]}}
+  svread_ver_za16_u16_vg2(2, base, 0); // expected-error {{argument value 2 is outside the valid range [0, 1]}}
+  svread_hor_za16_u16_vg4(2, base, 0); // expected-error {{argument value 2 is outside the valid range [0, 1]}}
+  svread_ver_za16_u16_vg4(2, base, 0); // expected-error {{argument value 2 is outside the valid range [0, 1]}}
+
+  svread_hor_za32_u32_vg2(4, base, 0); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
+  svread_ver_za32_u32_vg2(4, base, 0); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
+  svread_hor_za32_u32_vg4(4, base, 0); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
+  svread_ver_za32_u32_vg4(4, base, 0); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
+
+  svread_hor_za64_u64_vg2(8, base, 0); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+  svread_ver_za64_u64_vg2(8, base, 0); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+  svread_hor_za64_u64_vg4(8, base, 0); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+ svread_ver_za64_u64_vg4(8, base, 0); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + + // Test Offset Range + svread_hor_za8_u8_vg2(0, base, 13); // expected-error {{argument should be a multiple of 2}} + svread_hor_za8_u8_vg2(0, base, 16); // expected-error {{argument value 16 is outside the valid range [0, 14]}} + svread_ver_za8_u8_vg4(0, base, 11); // expected-error {{argument should be a multiple of 4}} + svread_ver_za8_u8_vg4(0, base, 16); // expected-error {{argument value 16 is outside the valid range [0, 12]}} + + svread_hor_za16_u16_vg2(0, base, 5); // expected-error {{argument should be a multiple of 2}} + svread_hor_za16_u16_vg2(0, base, 8); // expected-error {{argument value 8 is outside the valid range [0, 6]}} + svread_ver_za16_u16_vg2(0, base, 5); // expected-error {{argument should be a multiple of 2}} + svread_ver_za16_u16_vg2(0, base, 8); // expected-error {{argument value 8 is outside the valid range [0, 6]}} + + svread_hor_za16_u16_vg4(0, base, 3); // expected-error {{argument should be a multiple of 4}} + svread_hor_za16_u16_vg4(0, base, 8); // expected-error {{argument value 8 is outside the valid range [0, 4]}} + svread_ver_za16_u16_vg4(0, base, 3); // expected-error {{argument should be a multiple of 4}} + svread_ver_za16_u16_vg4(0, base, 8); // expected-error {{argument value 8 is outside the valid range [0, 4]}} + + svread_hor_za32_u32_vg2(0, base, 1); // expected-error {{argument should be a multiple of 2}} + svread_hor_za32_u32_vg2(0, base, 4); // expected-error {{argument value 4 is outside the valid range [0, 2]}} + svread_ver_za32_u32_vg2(0, base, 1); // expected-error {{argument should be a multiple of 2}} + svread_ver_za32_u32_vg2(0, base, 4); // expected-error {{argument value 4 is outside the valid range [0, 2]}} + + svread_hor_za32_u32_vg4(0, base, 4); // expected-error {{argument value 4 is outside the valid range [0, 0]}} + svread_hor_za64_u64_vg2(0, base, 2); // expected-error {{argument value 2 is outside the valid range [0, 0]}} + svread_hor_za64_u64_vg4(0, base, 4); // expected-error {{argument value 4 is outside the valid range [0, 0]}} + + svread_za64_u64_vg1x2(base, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svread_za64_u64_vg1x4(base, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_multivector_write(uint32_t base, svuint8x2_t v8x2, svuint8x4_t v8x4, + svuint16x2_t v16x2, svuint16x4_t v16x4, + svuint32x2_t v32x2, svuint32x4_t v32x4, + svuint64x2_t v64x2, svuint64x4_t v64x4) { + + // Test Tile Range + svwrite_hor_za8_u8_vg2(1, base, 0, v8x2); // expected-error {{argument value 1 is outside the valid range [0, 0]}} + svwrite_ver_za8_u8_vg2(1, base, 0, v8x2); // expected-error {{argument value 1 is outside the valid range [0, 0]}} + svwrite_hor_za8_u8_vg4(1, base, 0, v8x4); // expected-error {{argument value 1 is outside the valid range [0, 0]}} + svwrite_ver_za8_u8_vg4(1, base, 0, v8x4); // expected-error {{argument value 1 is outside the valid range [0, 0]}} + + svwrite_hor_za16_u16_vg2(2, base, 0, v16x2); // expected-error {{argument value 2 is outside the valid range [0, 1]}} + svwrite_ver_za16_u16_vg2(2, base, 0, v16x2); // expected-error {{argument value 2 is outside the valid range [0, 1]}} + svwrite_hor_za16_u16_vg4(2, base, 0, v16x4); // expected-error {{argument value 2 is outside the valid range [0, 1]}} + svwrite_ver_za16_u16_vg4(2, base, 0, v16x4); // expected-error {{argument 
value 2 is outside the valid range [0, 1]}} + + svwrite_hor_za32_u32_vg2(4, base, 0, v32x2); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + svwrite_ver_za32_u32_vg2(4, base, 0, v32x2); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + svwrite_hor_za32_u32_vg4(4, base, 0, v32x4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + svwrite_ver_za32_u32_vg4(4, base, 0, v32x4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + + svwrite_hor_za64_u64_vg2(8, base, 0, v64x2); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svwrite_ver_za64_u64_vg2(8, base, 0, v64x2); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svwrite_hor_za64_u64_vg4(8, base, 0, v64x4); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svwrite_ver_za64_u64_vg4(8, base, 0, v64x4); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + + // Test Offset Range + svwrite_hor_za8_u8_vg2(0, base, 13, v8x2); // expected-error {{argument should be a multiple of 2}} + svwrite_hor_za8_u8_vg2(0, base, 16, v8x2); // expected-error {{argument value 16 is outside the valid range [0, 14]}} + svwrite_ver_za8_u8_vg4(0, base, 11, v8x4); // expected-error {{argument should be a multiple of 4}} + svwrite_ver_za8_u8_vg4(0, base, 16, v8x4); // expected-error {{argument value 16 is outside the valid range [0, 12]}} + + svwrite_hor_za16_u16_vg2(0, base, 5, v16x2); // expected-error {{argument should be a multiple of 2}} + svwrite_hor_za16_u16_vg2(0, base, 8, v16x2); // expected-error {{argument value 8 is outside the valid range [0, 6]}} + svwrite_ver_za16_u16_vg2(0, base, 5, v16x2); // expected-error {{argument should be a multiple of 2}} + svwrite_ver_za16_u16_vg2(0, base, 8, v16x2); // expected-error {{argument value 8 is outside the valid range [0, 6]}} + + svwrite_hor_za16_u16_vg4(0, base, 3, v16x4); // expected-error {{argument should be a multiple of 4}} + svwrite_hor_za16_u16_vg4(0, base, 8, v16x4); // expected-error {{argument value 8 is outside the valid range [0, 4]}} + svwrite_ver_za16_u16_vg4(0, base, 3, v16x4); // expected-error {{argument should be a multiple of 4}} + svwrite_ver_za16_u16_vg4(0, base, 8, v16x4); // expected-error {{argument value 8 is outside the valid range [0, 4]}} + + svwrite_hor_za32_u32_vg2(0, base, 1, v32x2); // expected-error {{argument should be a multiple of 2}} + svwrite_hor_za32_u32_vg2(0, base, 4, v32x2); // expected-error {{argument value 4 is outside the valid range [0, 2]}} + svwrite_ver_za32_u32_vg2(0, base, 1, v32x2); // expected-error {{argument should be a multiple of 2}} + svwrite_ver_za32_u32_vg2(0, base, 4, v32x2); // expected-error {{argument value 4 is outside the valid range [0, 2]}} + + svwrite_hor_za32_u32_vg4(0, base, 4, v32x4); // expected-error {{argument value 4 is outside the valid range [0, 0]}} + svwrite_hor_za64_u64_vg2(0, base, 2, v64x2); // expected-error {{argument value 2 is outside the valid range [0, 0]}} + svwrite_hor_za64_u64_vg4(0, base, 4, v64x4); // expected-error {{argument value 4 is outside the valid range [0, 0]}} + + svwrite_za64_u64_vg1x2(base, 8, v64x2); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svwrite_za64_u64_vg1x4(base, 8, v64x4); // expected-error {{argument value 8 is outside the valid range [0, 7]}} +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_multiply_add_sub_long(uint32_t base, svint8_t s8, svuint8_t u8, + 
svint16_t s16, svuint16_t u16, svint8x2_t s8x2, + svuint8x2_t u8x2, svint16x2_t s16x2, svuint16x2_t u16x2, + svint8x4_t s8x4, svuint8x4_t u8x4, svint16x4_t s16x4, svuint16x4_t u16x4) { + svmla_single_za32_s8_vg4x1(base, 13, s8, s8); // expected-error {{argument value 13 is outside the valid range [0, 12]}} + svmla_single_za32_u8_vg4x1(base, 13, u8, u8); // expected-error {{argument value 13 is outside the valid range [0, 12]}} + svmla_single_za64_s16_vg4x1(base, 13, s16, s16); // expected-error {{argument value 13 is outside the valid range [0, 12]}} + svmla_single_za64_u16_vg4x1(base, 13, u16, u16); // expected-error {{argument value 13 is outside the valid range [0, 12]}} + + svmla_single_za32_s8_vg4x2(base, 5, s8x2, s8); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + svmla_single_za32_u8_vg4x2(base, 5, u8x2, u8); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + svmla_single_za64_s16_vg4x2(base, 5, s16x2, s16); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + svmla_single_za64_u16_vg4x2(base, 5, u16x2, u16); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + + svmla_single_za32_s8_vg4x4(base, 5, s8x4, s8); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + svmla_single_za32_u8_vg4x4(base, 5, u8x4, u8); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + svmla_single_za64_s16_vg4x4(base, 5, s16x4, s16); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + svmla_single_za64_u16_vg4x4(base, 5, u16x4, u16); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + + svmls_single_za32_s8_vg4x1(base, 13, s8, s8); // expected-error {{argument value 13 is outside the valid range [0, 12]}} + svmls_single_za32_u8_vg4x1(base, 13, u8, u8); // expected-error {{argument value 13 is outside the valid range [0, 12]}} + svmls_single_za64_s16_vg4x1(base, 13, s16, s16); // expected-error {{argument value 13 is outside the valid range [0, 12]}} + svmls_single_za64_u16_vg4x1(base, 13, u16, u16); // expected-error {{argument value 13 is outside the valid range [0, 12]}} + + svmls_single_za32_s8_vg4x2(base, 5, s8x2, s8); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + svmls_single_za32_u8_vg4x2(base, 5, u8x2, u8); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + svmls_single_za64_s16_vg4x2(base, 5, s16x2, s16); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + svmls_single_za64_u16_vg4x2(base, 5, u16x2, u16); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + + svmls_single_za32_s8_vg4x4(base, 5, s8x4, s8); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + svmls_single_za32_u8_vg4x4(base, 5, u8x4, u8); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + svmls_single_za64_s16_vg4x4(base, 5, s16x4, s16); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + svmls_single_za64_u16_vg4x4(base, 5, u16x4, u16); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + + svsumla_single_za32_u8_vg4x2(base, 5, s8x2, u8); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + svsumla_single_za32_u8_vg4x4(base, 5, s8x4, u8); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + + svusmla_single_za32_s8_vg4x1(base, 13, u8, s8); // expected-error {{argument value 13 is outside the valid range [0, 12]}} + 
svusmla_single_za32_s8_vg4x2(base, 5, u8x2, s8); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + svusmla_single_za32_s8_vg4x4(base, 5, u8x4, s8); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + + svmla_za32_s8_vg4x2(base, 5, s8x2, s8x2); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + svmla_za32_u8_vg4x2(base, 5, u8x2, u8x2); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + svmla_za64_s16_vg4x2(base, 5, s16x2, s16x2); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + svmla_za64_u16_vg4x2(base, 5, u16x2, u16x2); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + + svmla_za32_s8_vg4x4(base, 5, s8x4, s8x4); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + svmla_za32_u8_vg4x4(base, 5, u8x4, u8x4); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + svmla_za64_s16_vg4x4(base, 5, s16x4, s16x4); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + svmla_za64_u16_vg4x4(base, 5, u16x4, u16x4); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + + svmls_za32_s8_vg4x2(base, 5, s8x2, s8x2); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + svmls_za32_u8_vg4x2(base, 5, u8x2, u8x2); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + svmls_za64_s16_vg4x2(base, 5, s16x2, s16x2); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + svmls_za64_u16_vg4x2(base, 5, u16x2, u16x2); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + + svmls_za32_s8_vg4x4(base, 5, s8x4, s8x4); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + svmls_za32_u8_vg4x4(base, 5, u8x4, u8x4); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + svmls_za64_s16_vg4x4(base, 5, s16x4, s16x4); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + svmls_za64_u16_vg4x4(base, 5, u16x4, u16x4); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + + svusmla_za32_s8_vg4x2(base, 5, u8x2, s8x2); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + svusmla_za32_s8_vg4x4(base, 5, u8x4, s8x4); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + + svmla_lane_za32_s8_vg4x1(base, 13, s8, s8, 15); // expected-error {{argument value 13 is outside the valid range [0, 12]}} + svmla_lane_za32_u8_vg4x1(base, 13, u8, u8, 15); // expected-error {{argument value 13 is outside the valid range [0, 12]}} + svmla_lane_za64_s16_vg4x1(base, 13, s16, s16, 7); // expected-error {{argument value 13 is outside the valid range [0, 12]}} + svmla_lane_za64_u16_vg4x1(base, 13, u16, u16, 7); // expected-error {{argument value 13 is outside the valid range [0, 12]}} + + svmla_lane_za32_s8_vg4x2(base, 5, s8x2, s8, 15); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + svmla_lane_za32_u8_vg4x2(base, 5, u8x2, u8, 15); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + svmla_lane_za64_s16_vg4x2(base, 5, s16x2, s16, 7); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + svmla_lane_za64_u16_vg4x2(base, 5, u16x2, u16, 7); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + + svmla_lane_za32_s8_vg4x4(base, 5, s8x4, s8, 15); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + 
svmla_lane_za32_u8_vg4x4(base, 5, u8x4, u8, 15); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + svmla_lane_za64_s16_vg4x4(base, 5, s16x4, s16, 7); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + svmla_lane_za64_u16_vg4x4(base, 5, u16x4, u16, 7); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + + svmla_lane_za32_s8_vg4x1(base, 12, s8, s8, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}} + svmla_lane_za32_u8_vg4x1(base, 12, u8, u8, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}} + svmla_lane_za64_s16_vg4x1(base, 12, s16, s16, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svmla_lane_za64_u16_vg4x1(base, 12, u16, u16, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + + svmla_lane_za32_s8_vg4x2(base, 4, s8x2, s8, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}} + svmla_lane_za32_u8_vg4x2(base, 4, u8x2, u8, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}} + svmla_lane_za64_s16_vg4x2(base, 4, s16x2, s16, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svmla_lane_za64_u16_vg4x2(base, 4, u16x2, u16, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + + svmla_lane_za32_s8_vg4x4(base, 4, s8x4, s8, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}} + svmla_lane_za32_u8_vg4x4(base, 4, u8x4, u8, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}} + svmla_lane_za64_s16_vg4x4(base, 4, s16x4, s16, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svmla_lane_za64_u16_vg4x4(base, 4, u16x4, u16, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + + svmls_lane_za32_s8_vg4x1(base, 13, s8, s8, 15); // expected-error {{argument value 13 is outside the valid range [0, 12]}} + svmls_lane_za32_u8_vg4x1(base, 13, u8, u8, 15); // expected-error {{argument value 13 is outside the valid range [0, 12]}} + svmls_lane_za64_s16_vg4x1(base, 13, s16, s16, 7); // expected-error {{argument value 13 is outside the valid range [0, 12]}} + svmls_lane_za64_u16_vg4x1(base, 13, u16, u16, 7); // expected-error {{argument value 13 is outside the valid range [0, 12]}} + + svmls_lane_za32_s8_vg4x2(base, 5, s8x2, s8, 15); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + svmls_lane_za32_u8_vg4x2(base, 5, u8x2, u8, 15); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + svmls_lane_za64_s16_vg4x2(base, 5, s16x2, s16, 7); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + svmls_lane_za64_u16_vg4x2(base, 5, u16x2, u16, 7); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + + svmls_lane_za32_s8_vg4x4(base, 5, s8x4, s8, 15); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + svmls_lane_za32_u8_vg4x4(base, 5, u8x4, u8, 15); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + svmls_lane_za64_s16_vg4x4(base, 5, s16x4, s16, 7); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + svmls_lane_za64_u16_vg4x4(base, 5, u16x4, u16, 7); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + + svmls_lane_za32_s8_vg4x1(base, 12, s8, s8, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}} + 
svmls_lane_za32_u8_vg4x1(base, 12, u8, u8, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}} + svmls_lane_za64_s16_vg4x1(base, 12, s16, s16, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svmls_lane_za64_u16_vg4x1(base, 12, u16, u16, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + + svmls_lane_za32_s8_vg4x2(base, 4, s8x2, s8, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}} + svmls_lane_za32_u8_vg4x2(base, 4, u8x2, u8, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}} + svmls_lane_za64_s16_vg4x2(base, 4, s16x2, s16, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svmls_lane_za64_u16_vg4x2(base, 4, u16x2, u16, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + + svmls_lane_za32_s8_vg4x4(base, 4, s8x4, s8, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}} + svmls_lane_za32_u8_vg4x4(base, 4, u8x4, u8, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}} + svmls_lane_za64_s16_vg4x4(base, 4, s16x4, s16, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svmls_lane_za64_u16_vg4x4(base, 4, u16x4, u16, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + + svsumla_lane_za32_s8_vg4x1(base, 13, s8, u8, 15); // expected-error {{argument value 13 is outside the valid range [0, 12]}} + svsumla_lane_za32_s8_vg4x2(base, 5, s8x2, u8, 15); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + svsumla_lane_za32_s8_vg4x4(base, 5, s8x4, u8, 15); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + + svsumla_lane_za32_s8_vg4x1(base, 12, s8, u8, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}} + svsumla_lane_za32_s8_vg4x2(base, 4, s8x2, u8, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}} + svsumla_lane_za32_s8_vg4x4(base, 4, s8x4, u8, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}} + + svusmla_lane_za32_u8_vg4x1(base, 13, u8, s8, 15); // expected-error {{argument value 13 is outside the valid range [0, 12]}} + svusmla_lane_za32_u8_vg4x2(base, 5, u8x2, s8, 15); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + svusmla_lane_za32_u8_vg4x4(base, 5, u8x4, s8, 15); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + + svusmla_lane_za32_u8_vg4x1(base, 12, u8, s8, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}} + svusmla_lane_za32_u8_vg4x2(base, 4, u8x2, s8, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}} + svusmla_lane_za32_u8_vg4x4(base, 4, u8x4, s8, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}} +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_vertical_dot_product(uint32_t base, svint16x2_t s16x2, svuint16x2_t u16x2, + svint8x4_t s8x4, svuint8x4_t u8x4, + svint16x4_t s16x4, svuint16x4_t u16x4, + svfloat16x2_t f16x2, svbfloat16x2_t bf16x2, + svint16_t s16, svuint16_t u16, + svint8_t s8, svuint8_t u8, + svfloat16_t f16, svbfloat16_t b16) { + // Test slice offset values. 
+ svvdot_lane_za32_s16_vg1x2(base, 8, s16x2, s16, 0); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svvdot_lane_za32_u16_vg1x2(base, 8, u16x2, u16, 0); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svvdot_lane_za32_s8_vg1x4(base, 8, s8x4, s8, 0); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svvdot_lane_za32_u8_vg1x4(base, 8, u8x4, u8, 0); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svvdot_lane_za64_s16_vg1x4(base, 8, s16x4, s16, 0); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svvdot_lane_za64_u16_vg1x4(base, 8, u16x4, u16, 0); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svvdot_lane_za32_f16_vg1x2(base, 8, f16x2, f16, 0); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svvdot_lane_za32_bf16_vg1x2(base, 8, bf16x2, b16, 0); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svsuvdot_lane_za32_s8_vg1x4(base, 8, s8x4, s8, 0); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svusvdot_lane_za32_u8_vg1x4(base, 8, u8x4, u8, 0); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + + // Test lane indices. + svvdot_lane_za32_s16_vg1x2(base, 7, s16x2, s16, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + svvdot_lane_za32_u16_vg1x2(base, 7, u16x2, u16, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + svvdot_lane_za32_s8_vg1x4(base, 7, s8x4, s8, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + svvdot_lane_za32_u8_vg1x4(base, 7, u8x4, u8, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + svvdot_lane_za64_s16_vg1x4(base, 7, s16x4, s16, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}} + svvdot_lane_za64_u16_vg1x4(base, 7, u16x4, u16, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}} + svvdot_lane_za32_f16_vg1x2(base, 7, f16x2, f16, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + svvdot_lane_za32_bf16_vg1x2(base, 7, bf16x2, b16, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + svsuvdot_lane_za32_s8_vg1x4(base, 7, s8x4, s8, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + svusvdot_lane_za32_u8_vg1x4(base, 7, u8x4, u8, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_fdot_za32_bad_slice(uint32_t slice_base, svfloat16_t z_f16, + svfloat16x2_t z_f16x2, svfloat16x4_t z_f16x4, + svbfloat16_t z_bf16, svbfloat16x2_t z_bf16x2, + svbfloat16x4_t z_bf16x4) { + + // 16-bit float + svdot_za32_f16_vg1x2(slice_base, 8, z_f16x2, z_f16x2); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svdot_za32_f16_vg1x4(slice_base, 8, z_f16x4, z_f16x4); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svdot_single_za32_f16_vg1x2(slice_base, 8, z_f16x2, z_f16); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svdot_single_za32_f16_vg1x4(slice_base, 8, z_f16x4, z_f16); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svdot_lane_za32_f16_vg1x2(slice_base, 8, z_f16x2, z_f16, 3); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svdot_lane_za32_f16_vg1x4(slice_base, 8, z_f16x4, z_f16, 3); // 
expected-error {{argument value 8 is outside the valid range [0, 7]}} + + // 16-bit binary float + svdot_za32_bf16_vg1x2(slice_base, 8, z_bf16x2, z_bf16x2); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svdot_za32_bf16_vg1x4(slice_base, 8, z_bf16x4, z_bf16x4); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svdot_single_za32_bf16_vg1x2(slice_base, 8, z_bf16x2, z_bf16); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svdot_single_za32_bf16_vg1x4(slice_base, 8, z_bf16x4, z_bf16); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svdot_lane_za32_bf16_vg1x2(slice_base, 8, z_bf16x2, z_bf16, 3); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svdot_lane_za32_bf16_vg1x4(slice_base, 8, z_bf16x4, z_bf16, 3); // expected-error {{argument value 8 is outside the valid range [0, 7]}} +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_fdot_za32_bad_lane(uint32_t slice_base, svfloat16_t z_f16, + svfloat16x2_t z_f16x2, svfloat16x4_t z_f16x4, + svbfloat16_t z_bf16, svbfloat16x2_t z_bf16x2, + svbfloat16x4_t z_bf16x4) { + // 16-bit float + svdot_lane_za32_f16_vg1x2(slice_base, 7, z_f16x2, z_f16, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + svdot_lane_za32_f16_vg1x4(slice_base, 7, z_f16x4, z_f16, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + + // 16-bit binary float + svdot_lane_za32_bf16_vg1x2(slice_base, 7, z_bf16x2, z_bf16, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + svdot_lane_za32_bf16_vg1x4(slice_base, 7, z_bf16x4, z_bf16, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svdot_multi_za32_bad_slice(uint32_t slice_base, svuint16_t z_u16, + svuint16x2_t z_u16x2, svuint16x4_t z_u16x4, + svint16_t z_s16, svint16x2_t z_s16x2, + svint16x4_t z_s16x4, svuint8_t z_u8, + svuint8x2_t z_u8x2, svuint8x4_t z_u8x4, + svint8_t z_s8, svint8x2_t z_s8x2, + svint8x4_t z_s8x4) { + // Multi, multi (unsigned) + svdot_za32_u16_vg1x2(slice_base, 8, z_u16x2, z_u16x2); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svdot_za32_u16_vg1x4(slice_base, 8, z_u16x4, z_u16x4); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svdot_za32_u8_vg1x2(slice_base, 8, z_u8x2, z_u8x2); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svdot_za32_u8_vg1x4(slice_base, 8, z_u8x4, z_u8x4); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svdot_za64_u16_vg1x2(slice_base, 8, z_u16x2, z_u16x2); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svdot_za64_u16_vg1x4(slice_base, 8, z_u16x4, z_u16x4); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + + // Multi, multi (signed) + svdot_za32_s16_vg1x2(slice_base, 8, z_s16x2, z_s16x2); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svdot_za32_s16_vg1x4(slice_base, 8, z_s16x4, z_s16x4); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svdot_za32_s8_vg1x2(slice_base, 8, z_s8x2, z_s8x2); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svdot_za32_s8_vg1x4(slice_base, 8, z_s8x4, z_s8x4); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svdot_za64_s16_vg1x2(slice_base, 8, z_s16x2, z_s16x2); // expected-error {{argument value 8 
is outside the valid range [0, 7]}} + svdot_za64_s16_vg1x4(slice_base, 8, z_s16x4, z_s16x4); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + + // Multi, single (unsigned) + svdot_single_za32_u16_vg1x2(slice_base, 8, z_u16x2, z_u16); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svdot_single_za32_u16_vg1x4(slice_base, 8, z_u16x4, z_u16); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svdot_single_za32_u8_vg1x2(slice_base, 8, z_u8x2, z_u8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svdot_single_za32_u8_vg1x4(slice_base, 8, z_u8x4, z_u8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svdot_single_za64_u16_vg1x2(slice_base, 8, z_u16x2, z_u16); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svdot_single_za64_u16_vg1x4(slice_base, 8, z_u16x4, z_u16); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + + // Multi, single (signed) + svdot_single_za32_s16_vg1x2(slice_base, 8, z_s16x2, z_s16); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svdot_single_za32_s16_vg1x4(slice_base, 8, z_s16x4, z_s16); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svdot_single_za32_s8_vg1x2(slice_base, 8, z_s8x2, z_s8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svdot_single_za32_s8_vg1x4(slice_base, 8, z_s8x4, z_s8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svdot_single_za64_s16_vg1x2(slice_base, 8, z_s16x2, z_s16); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svdot_single_za64_s16_vg1x4(slice_base, 8, z_s16x4, z_s16); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + + // Multi, indexed (unsigned) + svdot_lane_za32_u16_vg1x2(slice_base, 8, z_u16x2, z_u16, 3); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svdot_lane_za32_u16_vg1x4(slice_base, 8, z_u16x4, z_u16, 3); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svdot_lane_za32_u8_vg1x2(slice_base, 8, z_u8x2, z_u8, 3); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svdot_lane_za32_u8_vg1x4(slice_base, 8, z_u8x4, z_u8, 3); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svdot_lane_za64_u16_vg1x2(slice_base, 8, z_u16x2, z_u16, 1); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svdot_lane_za64_u16_vg1x4(slice_base, 8, z_u16x4, z_u16, 1); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + + // Multi, indexed (signed) + svdot_lane_za32_s16_vg1x2(slice_base, 8, z_s16x2, z_s16, 3); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svdot_lane_za32_s16_vg1x4(slice_base, 8, z_s16x4, z_s16, 3); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svdot_lane_za32_s8_vg1x2(slice_base, 8, z_s8x2, z_s8, 3); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svdot_lane_za32_s8_vg1x4(slice_base, 8, z_s8x4, z_s8, 3); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svdot_lane_za64_s16_vg1x2(slice_base, 8, z_s16x2, z_s16, 1); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svdot_lane_za64_s16_vg1x4(slice_base, 8, z_s16x4, z_s16, 1); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + + // Multi, multi (unsigned by 
signed) + svusdot_za32_u8_vg1x2(slice_base, 8, z_u8x2, z_s8x2); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svusdot_za32_u8_vg1x4(slice_base, 8, z_u8x4, z_s8x4); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + + // Multi, single (unsigned by signed) + svusdot_single_za32_u8_vg1x2(slice_base, 8, z_u8x2, z_s8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svusdot_single_za32_u8_vg1x4(slice_base, 8, z_u8x4, z_s8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + + // Multi, indexed (unsigned by signed) + svusdot_lane_za32_u8_vg1x2(slice_base, 8, z_u8x2, z_s8, 3); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svusdot_lane_za32_u8_vg1x4(slice_base, 8, z_u8x4, z_s8, 3); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + + // Multi, single (signed by unsigned) + svsudot_single_za32_s8_vg1x2(slice_base, 8, z_s8x2, z_u8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svsudot_single_za32_s8_vg1x4(slice_base, 8, z_s8x4, z_u8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + + // Multi, indexed (unsigned by signed) + svsudot_lane_za32_s8_vg1x2(slice_base, 8, z_s8x2, z_u8, 3); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svsudot_lane_za32_s8_vg1x4(slice_base, 8, z_s8x4, z_u8, 3); // expected-error {{argument value 8 is outside the valid range [0, 7]}} +} + + +__attribute__((arm_streaming, arm_shared_za)) +void test_svdot_multi_za32_bad_lane(uint32_t slice_base, svuint16_t z_u16, + svuint16x2_t z_u16x2, svuint16x4_t z_u16x4, + svint16_t z_s16, svint16x2_t z_s16x2, + svint16x4_t z_s16x4, svuint8_t z_u8, + svuint8x2_t z_u8x2, svuint8x4_t z_u8x4, + svint8_t z_s8, svint8x2_t z_s8x2, + svint8x4_t z_s8x4) { + // Multi, indexed (unsigned) + svdot_lane_za32_u16_vg1x2(slice_base, 7, z_u16x2, z_u16, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + svdot_lane_za32_u16_vg1x4(slice_base, 7, z_u16x4, z_u16, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + svdot_lane_za32_u8_vg1x2(slice_base, 7, z_u8x2, z_u8, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + svdot_lane_za32_u8_vg1x4(slice_base, 7, z_u8x4, z_u8, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + svdot_lane_za64_u16_vg1x2(slice_base, 7, z_u16x2, z_u16, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}} + svdot_lane_za64_u16_vg1x4(slice_base, 7, z_u16x4, z_u16, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}} + + // Multi, indexed (signed) + svdot_lane_za32_s16_vg1x2(slice_base, 7, z_s16x2, z_s16, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + svdot_lane_za32_s16_vg1x4(slice_base, 7, z_s16x4, z_s16, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + svdot_lane_za32_s8_vg1x2(slice_base, 7, z_s8x2, z_s8, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + svdot_lane_za32_s8_vg1x4(slice_base, 7, z_s8x4, z_s8, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + svdot_lane_za64_s16_vg1x2(slice_base, 7, z_s16x2, z_s16, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}} + svdot_lane_za64_s16_vg1x4(slice_base, 7, z_s16x4, z_s16, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}} + + 
// Multi, indexed (unsigned by signed) + svusdot_lane_za32_u8_vg1x2(slice_base, 7, z_u8x2, z_s8, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + svusdot_lane_za32_u8_vg1x4(slice_base, 7, z_u8x4, z_s8, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + + // Multi, indexed (unsigned by signed) + svsudot_lane_za32_s8_vg1x2(slice_base, 7, z_s8x2, z_u8, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + svsudot_lane_za32_s8_vg1x4(slice_base, 7, z_s8x4, z_u8, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} +} + +__attribute__((arm_streaming_compatible)) +void test_bfmlslb_bad_lane(svfloat32_t zda, svbfloat16_t zn, svbfloat16_t zm) { + svbfmlslb_lane_f32(zda, zn, zm, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svbfmlslt_lane_f32(zda, zn, zm, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} +} + +__attribute__((arm_streaming_compatible, arm_shared_za, arm_preserves_za)) +void test_ldr_str_zt(const void *const_base, void *base) { + svldr_zt(1, const_base); // expected-error {{argument value 1 is outside the valid range [0, 0]}} + svstr_zt(1, base); // expected-error {{argument value 1 is outside the valid range [0, 0]}} +} + +__attribute__((arm_streaming_compatible, arm_shared_za, arm_preserves_za)) +void test_zero_zt() { + // Test Reg Offset + svzero_zt(1); // expected-error {{argument value 1 is outside the valid range [0, 0]}} +} + +__attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) +void test_svluti2_lane_zt(svuint8_t zn_u8, svuint16_t zn_u16, svuint32_t zn_u32) { + // Test Reg Offset + svluti2_lane_zt_u8(1, zn_u8, 2); // expected-error {{argument value 1 is outside the valid range [0, 0]}} + // Test index value range + svluti2_lane_zt_u8(0, zn_u8, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}} + + // Test Reg Offset + svluti2_lane_zt_u16(1, zn_u16, 2); // expected-error {{argument value 1 is outside the valid range [0, 0]}} + // Test index value range + svluti2_lane_zt_u16(0, zn_u16, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}} + + // Test Reg Offset + svluti2_lane_zt_u32(1, zn_u32, 2); // expected-error {{argument value 1 is outside the valid range [0, 0]}} + // Test index value range + svluti2_lane_zt_u32(0, zn_u32, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}} +} + +__attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) +void test_svluti4_lane_zt(svuint8_t zn_u8, svuint16_t zn_u16, svuint32_t zn_u32) { + // Test Reg Offset + svluti4_lane_zt_u8(1, zn_u8, 2); // expected-error {{argument value 1 is outside the valid range [0, 0]}} + // Test index value range + svluti4_lane_zt_u8(0, zn_u8, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + + // Test Reg Offset + svluti4_lane_zt_u16(1, zn_u16, 2); // expected-error {{argument value 1 is outside the valid range [0, 0]}} + // Test index value range + svluti4_lane_zt_u16(0, zn_u16, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + + // Test Reg Offset + svluti4_lane_zt_u32(1, zn_u32, 2); // expected-error {{argument value 1 is outside the valid range [0, 0]}} + // Test index value range + svluti4_lane_zt_u32(0, zn_u32, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} +} + +__attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) +void 
test_svluti2_lane_zt_x2(svuint8_t zn_u8, svuint16_t zn_u16, svuint32_t zn_u32) { + // Test Reg Offset + svluti2_lane_zt_u8_x2(1, zn_u8, 2); // expected-error {{argument value 1 is outside the valid range [0, 0]}} + // Test index value range + svluti2_lane_zt_u8_x2(0, zn_u8, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + + // Test Reg Offset + svluti2_lane_zt_u16_x2(1, zn_u16, 2); // expected-error {{argument value 1 is outside the valid range [0, 0]}} + // Test index value range + svluti2_lane_zt_u16_x2(0, zn_u16, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + + // Test Reg Offset + svluti2_lane_zt_u32_x2(1, zn_u32, 2); // expected-error {{argument value 1 is outside the valid range [0, 0]}} + // Test index value range + svluti2_lane_zt_u32_x2(0, zn_u32, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} +} + +__attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) +void test_svluti4_lane_zt_x2(svuint8_t zn_u8, svuint16_t zn_u16, svuint32_t zn_u32) { + // Test Reg Offset + svluti4_lane_zt_u8_x2(1, zn_u8, 2); // expected-error {{argument value 1 is outside the valid range [0, 0]}} + // Test index value range + svluti4_lane_zt_u8_x2(0, zn_u8, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + + // Test Reg Offset + svluti4_lane_zt_u16_x2(1, zn_u16, 2); // expected-error {{argument value 1 is outside the valid range [0, 0]}} + // Test index value range + svluti4_lane_zt_u16_x2(0, zn_u16, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + + // Test Reg Offset + svluti4_lane_zt_u32_x2(1, zn_u32, 2); // expected-error {{argument value 1 is outside the valid range [0, 0]}} + // Test index value range + svluti4_lane_zt_u32_x2(0, zn_u32, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} +} + +__attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) +void test_svluti2_lane_zt_x4(svuint8_t zn_u8, svuint16_t zn_u16, svuint32_t zn_u32) { + // Test Reg Offset + svluti2_lane_zt_u8_x4(1, zn_u8, 0); // expected-error {{argument value 1 is outside the valid range [0, 0]}} + // Test index value range + svluti2_lane_zt_u8_x4(0, zn_u8, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + + // Test Reg Offset + svluti2_lane_zt_u16_x4(1, zn_u16, 3); // expected-error {{argument value 1 is outside the valid range [0, 0]}} + // Test index value range + svluti2_lane_zt_u16_x4(0, zn_u16, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + + // Test Reg Offset + svluti2_lane_zt_u32_x4(1, zn_u32, 3); // expected-error {{argument value 1 is outside the valid range [0, 0]}} + // Test index value range + svluti2_lane_zt_u32_x4(0, zn_u32, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} +} + +__attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) +void test_svluti4_lane_zt_x4(svuint8_t zn_u8, svuint16_t zn_u16, svuint32_t zn_u32) { + // Test Reg Offset + svluti4_lane_zt_u16_x4(1, zn_u16, 0); // expected-error {{argument value 1 is outside the valid range [0, 0]}} + // Test index value range + svluti4_lane_zt_u16_x4(0, zn_u16, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}} + + // Test Reg Offset + svluti4_lane_zt_u32_x4(1, zn_u32, 1); // expected-error {{argument value 1 is outside the valid range [0, 0]}} + // Test index value range + svluti4_lane_zt_u32_x4(0, zn_u32, 2); // expected-error {{argument value 2 is outside 
the valid range [0, 1]}} +} diff --git a/clang/test/Sema/aarch64-sve2p1-intrinsics/acle_sve2p1_imm.cpp b/clang/test/Sema/aarch64-sve2p1-intrinsics/acle_sve2p1_imm.cpp new file mode 100644 --- /dev/null +++ b/clang/test/Sema/aarch64-sve2p1-intrinsics/acle_sve2p1_imm.cpp @@ -0,0 +1,230 @@ +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -target-feature +sme2 -fsyntax-only -verify %s + +// REQUIRES: aarch64-registered-target + +#include + +__attribute__((arm_streaming)) +void test_svpext_lane_imm_0_3(svcount_t c) { + svpext_lane_c8(c, -1); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + svpext_lane_c16(c, -1); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + svpext_lane_c32(c, -1); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + svpext_lane_c64(c, -1); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + + svpext_lane_c8(c, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + svpext_lane_c16(c, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + svpext_lane_c32(c, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + svpext_lane_c64(c, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} +} + +__attribute__((arm_streaming)) +void test_svpext_lane_x2_imm_0_1(svcount_t c) { + svpext_lane_c8_x2(c, -1); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 1]}} + svpext_lane_c16_x2(c, -1); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 1]}} + svpext_lane_c32_x2(c, -1); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 1]}} + svpext_lane_c64_x2(c, -1); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 1]}} + + svpext_lane_c8_x2(c, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}} + svpext_lane_c16_x2(c, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}} + svpext_lane_c32_x2(c, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}} + svpext_lane_c64_x2(c, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}} +} + +__attribute__((arm_streaming)) +void test_svpsel_lane_imm() { + svpsel_lane_b8(svptrue_b8(), svptrue_b8(), 0, -1); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 15]}} + svpsel_lane_b16(svptrue_b16(), svptrue_b16(), 0, -1); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 7]}} + svpsel_lane_b32(svptrue_b32(), svptrue_b32(), 0, -1); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + svpsel_lane_b64(svptrue_b64(), svptrue_b64(), 0, -1); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 1]}} + + svpsel_lane_b8(svptrue_b8(), svptrue_b8(), 0, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}} + svpsel_lane_b16(svptrue_b16(), svptrue_b16(), 0, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svpsel_lane_b32(svptrue_b32(), svptrue_b32(), 0, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + svpsel_lane_b64(svptrue_b64(), svptrue_b64(), 0, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}} + 
+ svpsel_lane_c8(svptrue_c8(), svptrue_b8(), 0, -1); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 15]}} + svpsel_lane_c16(svptrue_c16(), svptrue_b16(), 0, -1); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 7]}} + svpsel_lane_c32(svptrue_c32(), svptrue_b32(), 0, -1); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + svpsel_lane_c64(svptrue_c64(), svptrue_b64(), 0, -1); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 1]}} + + svpsel_lane_c8(svptrue_c8(), svptrue_b8(), 0, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}} + svpsel_lane_c16(svptrue_c16(), svptrue_b16(), 0, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svpsel_lane_c32(svptrue_c32(), svptrue_b32(), 0, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + svpsel_lane_c64(svptrue_c64(), svptrue_b64(), 0, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}} +} + +__attribute__((arm_streaming)) +svcount_t test_svwhile_pn(int64_t op1, int64_t op2) { + svwhilege_c8(op1, op2, 6); // expected-error {{argument value 6 is outside the valid range [2, 4]}} + svwhilege_c16(op1, op2, 6); // expected-error {{argument value 6 is outside the valid range [2, 4]}} + svwhilege_c32(op1, op2, 6); // expected-error {{argument value 6 is outside the valid range [2, 4]}} + svwhilege_c64(op1, op2, 6); // expected-error {{argument value 6 is outside the valid range [2, 4]}} + svwhilegt_c8(op1, op2, 6); // expected-error {{argument value 6 is outside the valid range [2, 4]}} + svwhilegt_c16(op1, op2, 6); // expected-error {{argument value 6 is outside the valid range [2, 4]}} + svwhilegt_c32(op1, op2, 6); // expected-error {{argument value 6 is outside the valid range [2, 4]}} + svwhilegt_c64(op1, op2, 6); // expected-error {{argument value 6 is outside the valid range [2, 4]}} + svwhilehi_c8(op1, op2, 6); // expected-error {{argument value 6 is outside the valid range [2, 4]}} + svwhilehi_c16(op1, op2, 6); // expected-error {{argument value 6 is outside the valid range [2, 4]}} + svwhilehi_c32(op1, op2, 6); // expected-error {{argument value 6 is outside the valid range [2, 4]}} + svwhilehi_c64(op1, op2, 6); // expected-error {{argument value 6 is outside the valid range [2, 4]}} + svwhilehs_c8(op1, op2, 6); // expected-error {{argument value 6 is outside the valid range [2, 4]}} + svwhilehs_c16(op1, op2, 6); // expected-error {{argument value 6 is outside the valid range [2, 4]}} + svwhilehs_c32(op1, op2, 6); // expected-error {{argument value 6 is outside the valid range [2, 4]}} + svwhilehs_c64(op1, op2, 6); // expected-error {{argument value 6 is outside the valid range [2, 4]}} + svwhilele_c8(op1, op2, 6); // expected-error {{argument value 6 is outside the valid range [2, 4]}} + svwhilele_c16(op1, op2, 6); // expected-error {{argument value 6 is outside the valid range [2, 4]}} + svwhilele_c32(op1, op2, 6); // expected-error {{argument value 6 is outside the valid range [2, 4]}} + svwhilele_c64(op1, op2, 6); // expected-error {{argument value 6 is outside the valid range [2, 4]}} + svwhilelo_c8(op1, op2, 6); // expected-error {{argument value 6 is outside the valid range [2, 4]}} + svwhilelo_c16(op1, op2, 6); // expected-error {{argument value 6 is outside the valid range [2, 4]}} + svwhilelo_c32(op1, op2, 6); // expected-error {{argument value 6 is outside the valid range [2, 
4]}} + svwhilelo_c64(op1, op2, 6); // expected-error {{argument value 6 is outside the valid range [2, 4]}} + svwhilels_c8(op1, op2, 6); // expected-error {{argument value 6 is outside the valid range [2, 4]}} + svwhilels_c16(op1, op2, 6); // expected-error {{argument value 6 is outside the valid range [2, 4]}} + svwhilels_c32(op1, op2, 6); // expected-error {{argument value 6 is outside the valid range [2, 4]}} + svwhilels_c64(op1, op2, 6); // expected-error {{argument value 6 is outside the valid range [2, 4]}} + svwhilelt_c8(op1, op2, 6); // expected-error {{argument value 6 is outside the valid range [2, 4]}} + svwhilelt_c16(op1, op2, 6); // expected-error {{argument value 6 is outside the valid range [2, 4]}} + svwhilelt_c32(op1, op2, 6); // expected-error {{argument value 6 is outside the valid range [2, 4]}} + svwhilelt_c64(op1, op2, 6); // expected-error {{argument value 6 is outside the valid range [2, 4]}} + + svwhilege_c8(op1, op2, 3); // expected-error {{argument should be a multiple of 2}} + svwhilege_c16(op1, op2, 3); // expected-error {{argument should be a multiple of 2}} + svwhilege_c32(op1, op2, 3); // expected-error {{argument should be a multiple of 2}} + svwhilege_c64(op1, op2, 3); // expected-error {{argument should be a multiple of 2}} + svwhilegt_c8(op1, op2, 3); // expected-error {{argument should be a multiple of 2}} + svwhilegt_c16(op1, op2, 3); // expected-error {{argument should be a multiple of 2}} + svwhilegt_c32(op1, op2, 3); // expected-error {{argument should be a multiple of 2}} + svwhilegt_c64(op1, op2, 3); // expected-error {{argument should be a multiple of 2}} + svwhilehi_c8(op1, op2, 3); // expected-error {{argument should be a multiple of 2}} + svwhilehi_c16(op1, op2, 3); // expected-error {{argument should be a multiple of 2}} + svwhilehi_c32(op1, op2, 3); // expected-error {{argument should be a multiple of 2}} + svwhilehi_c64(op1, op2, 3); // expected-error {{argument should be a multiple of 2}} + svwhilehs_c8(op1, op2, 3); // expected-error {{argument should be a multiple of 2}} + svwhilehs_c16(op1, op2, 3); // expected-error {{argument should be a multiple of 2}} + svwhilehs_c32(op1, op2, 3); // expected-error {{argument should be a multiple of 2}} + svwhilehs_c64(op1, op2, 3); // expected-error {{argument should be a multiple of 2}} + svwhilele_c8(op1, op2, 3); // expected-error {{argument should be a multiple of 2}} + svwhilele_c16(op1, op2, 3); // expected-error {{argument should be a multiple of 2}} + svwhilele_c32(op1, op2, 3); // expected-error {{argument should be a multiple of 2}} + svwhilele_c64(op1, op2, 3); // expected-error {{argument should be a multiple of 2}} + svwhilelo_c8(op1, op2, 3); // expected-error {{argument should be a multiple of 2}} + svwhilelo_c16(op1, op2, 3); // expected-error {{argument should be a multiple of 2}} + svwhilelo_c32(op1, op2, 3); // expected-error {{argument should be a multiple of 2}} + svwhilelo_c64(op1, op2, 3); // expected-error {{argument should be a multiple of 2}} + svwhilels_c8(op1, op2, 3); // expected-error {{argument should be a multiple of 2}} + svwhilels_c16(op1, op2, 3); // expected-error {{argument should be a multiple of 2}} + svwhilels_c32(op1, op2, 3); // expected-error {{argument should be a multiple of 2}} + svwhilels_c64(op1, op2, 3); // expected-error {{argument should be a multiple of 2}} + svwhilelt_c8(op1, op2, 3); // expected-error {{argument should be a multiple of 2}} + svwhilelt_c16(op1, op2, 3); // expected-error {{argument should be a multiple of 2}} + svwhilelt_c32(op1, 
op2, 3); // expected-error {{argument should be a multiple of 2}} + svwhilelt_c64(op1, op2, 3); // expected-error {{argument should be a multiple of 2}} +} + +__attribute__((arm_streaming)) +void test_svqrshr_x2(svint32x2_t zn_x2, svuint32x2_t zn_ux2) { + svqrshr_s16_x2(zn_x2, 0); // expected-error {{argument value 0 is outside the valid range [1, 16]}} + svqrshr_u16_x2(zn_ux2, 0); // expected-error {{argument value 0 is outside the valid range [1, 16]}} + + svqrshr_s16_x2(zn_x2, 17); // expected-error {{argument value 17 is outside the valid range [1, 16]}} + svqrshr_u16_x2(zn_ux2, 17); // expected-error {{argument value 17 is outside the valid range [1, 16]}} +} + +__attribute__((arm_streaming)) +void test_svqrshr_32x4(svint32x4_t zn_x4, svuint32x4_t zn_ux4) { + svqrshr_s8_x4(zn_x4, 0); // expected-error {{argument value 0 is outside the valid range [1, 32]}} + svqrshr_u8_x4(zn_ux4, 0); // expected-error {{argument value 0 is outside the valid range [1, 32]}} + + svqrshr_s8_x4(zn_x4, 33); // expected-error {{argument value 33 is outside the valid range [1, 32]}} + svqrshr_u8_x4(zn_ux4, 33); // expected-error {{argument value 33 is outside the valid range [1, 32]}} +} + +__attribute__((arm_streaming)) +void test_svqrshr_64x4(svint64x4_t zn_x4, svuint64x4_t zn_ux4) { + svqrshr_s16_x4(zn_x4, 0); // expected-error {{argument value 0 is outside the valid range [1, 64]}} + svqrshr_u16_x4(zn_ux4, 0); // expected-error {{argument value 0 is outside the valid range [1, 64]}} + + svqrshr_s16_x4(zn_x4, 65); // expected-error {{argument value 65 is outside the valid range [1, 64]}} + svqrshr_u16_x4(zn_ux4, 65); // expected-error {{argument value 65 is outside the valid range [1, 64]}} +} + +__attribute__((arm_streaming_compatible)) +void test_svqrshrn_x2(svint32x2_t zn_x2, svuint32x2_t zn_ux2) { + svqrshrn_s16_x2(zn_x2, 0); // expected-error {{argument value 0 is outside the valid range [1, 16]}} + svqrshrn_u16_x2(zn_ux2, 0); // expected-error {{argument value 0 is outside the valid range [1, 16]}} + + svqrshrn_s16_x2(zn_x2, 17); // expected-error {{argument value 17 is outside the valid range [1, 16]}} + svqrshrn_u16_x2(zn_ux2, 17); // expected-error {{argument value 17 is outside the valid range [1, 16]}} +} + +__attribute__((arm_streaming)) +void test_svqrshrn_32x4(svint32x4_t zn_x4, svuint32x4_t zn_ux4) { + svqrshrn_s8_x4(zn_x4, 0); // expected-error {{argument value 0 is outside the valid range [1, 32]}} + svqrshrn_u8_x4(zn_ux4, 0); // expected-error {{argument value 0 is outside the valid range [1, 32]}} + + svqrshrn_s8_x4(zn_x4, 33); // expected-error {{argument value 33 is outside the valid range [1, 32]}} + svqrshrn_u8_x4(zn_ux4, 33); // expected-error {{argument value 33 is outside the valid range [1, 32]}} +} + +__attribute__((arm_streaming)) +void test_svqrshrn_64x4(svint64x4_t zn_x4, svuint64x4_t zn_ux4) { + svqrshrn_s16_x4(zn_x4, 0); // expected-error {{argument value 0 is outside the valid range [1, 64]}} + svqrshrn_u16_x4(zn_ux4, 0); // expected-error {{argument value 0 is outside the valid range [1, 64]}} + + svqrshrn_s16_x4(zn_x4, 65); // expected-error {{argument value 65 is outside the valid range [1, 64]}} + svqrshrn_u16_x4(zn_ux4, 65); // expected-error {{argument value 65 is outside the valid range [1, 64]}} +} + +__attribute__((arm_streaming)) +void test_svqrshru(svint32x2_t zn_x2, svint32x4_t zn_32x4, svint64x4_t zn_64x4) { + svsqrshru_u16_x2(zn_x2, 0); // expected-error {{argument value 0 is outside the valid range [1, 16]}} + svsqrshru_u8_x4(zn_32x4, 0); // expected-error 
{{argument value 0 is outside the valid range [1, 32]}} + svsqrshru_u16_x4(zn_64x4, 0); // expected-error {{argument value 0 is outside the valid range [1, 64]}} + + svsqrshru_u16_x2(zn_x2, 17); // expected-error {{argument value 17 is outside the valid range [1, 16]}} + svsqrshru_u8_x4(zn_32x4, 33); // expected-error {{argument value 33 is outside the valid range [1, 32]}} + svsqrshru_u16_x4(zn_64x4, 65); // expected-error {{argument value 65 is outside the valid range [1, 64]}} +} + +__attribute__((arm_streaming_compatible)) +void test_svqrshrun_x2(svint32x2_t zn_x2) { + svsqrshrun_u16_x2(zn_x2, 0); // expected-error {{argument value 0 is outside the valid range [1, 16]}} + + svsqrshrun_u16_x2(zn_x2, 17); // expected-error {{argument value 17 is outside the valid range [1, 16]}} +} + +__attribute__((arm_streaming)) +void test_svqrshrun_32x4(svint32x4_t zn_x4, svuint32x4_t zn_ux4) { + svsqrshrun_u8_x4(zn_x4, 0); // expected-error {{argument value 0 is outside the valid range [1, 32]}} + + svsqrshrun_u8_x4(zn_x4, 33); // expected-error {{argument value 33 is outside the valid range [1, 32]}} +} + +__attribute__((arm_streaming)) +void test_svqrshrun_64x4(svint64x4_t zn_x4, svuint64x4_t zn_ux4) { + svsqrshrun_u16_x4(zn_x4, 0); // expected-error {{argument value 0 is outside the valid range [1, 64]}} + + svsqrshrun_u16_x4(zn_x4, 65); // expected-error {{argument value 65 is outside the valid range [1, 64]}} +} + +__attribute__((arm_streaming)) +void test_cntp(svcount_t c) { + svcntp_c8(c, 6); // expected-error {{argument value 6 is outside the valid range [2, 4]}} + svcntp_c16(c, 6); // expected-error {{argument value 6 is outside the valid range [2, 4]}} + svcntp_c32(c, 6); // expected-error {{argument value 6 is outside the valid range [2, 4]}} + svcntp_c64(c, 6); // expected-error {{argument value 6 is outside the valid range [2, 4]}} + + svcntp_c8(c, 3); // expected-error {{argument should be a multiple of 2}} + svcntp_c16(c, 3); // expected-error {{argument should be a multiple of 2}} + svcntp_c32(c, 3); // expected-error {{argument should be a multiple of 2}} + svcntp_c64(c, 3); // expected-error {{argument should be a multiple of 2}} +} + +__attribute__((arm_streaming_compatible)) +void test_svdot_lane_2way(svint32_t s32, svuint32_t u32, svint16_t s16, svuint16_t u16, + svfloat32_t f32, svfloat16_t f16) { + svdot_lane_s32_s16_s16(s32, s16, s16, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + svdot_lane_u32_u16_u16(u32, u16, u16, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + svdot_lane_f32_f16_f16(f32, f16, f16, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} +} diff --git a/clang/utils/TableGen/NeonEmitter.cpp b/clang/utils/TableGen/NeonEmitter.cpp --- a/clang/utils/TableGen/NeonEmitter.cpp +++ b/clang/utils/TableGen/NeonEmitter.cpp @@ -552,6 +552,8 @@ void genBuiltinsDef(raw_ostream &OS, SmallVectorImpl &Defs); void genOverloadTypeCheckCode(raw_ostream &OS, SmallVectorImpl &Defs); + void genStreamingSVECompatibleList(raw_ostream &OS, + SmallVectorImpl &Defs); void genIntrinsicRangeCheckCode(raw_ostream &OS, SmallVectorImpl &Defs); @@ -2039,6 +2041,30 @@ OS << "#endif\n\n"; } +void NeonEmitter::genStreamingSVECompatibleList( + raw_ostream &OS, SmallVectorImpl &Defs) { + OS << "#ifdef GET_NEON_STREAMING_COMPAT_FLAG\n"; + + std::set Emitted; + for (auto *Def : Defs) { + // If the def has a body (that is, it has Operation DAGs), it won't call + // __builtin_neon_* so we don't need to generate a 
definition for it. + if (Def->hasBody()) + continue; + + std::string Name = Def->getMangledName(); + if (Emitted.find(Name) != Emitted.end()) + continue; + + // FIXME: We should make exceptions here for some NEON builtins that are + // permitted in streaming mode. + OS << "case NEON::BI__builtin_neon_" << Name + << ": BuiltinType = ArmNonStreaming; break;\n"; + Emitted.insert(Name); + } + OS << "#endif\n\n"; +} + /// Generate the ARM and AArch64 overloaded type checking code for /// SemaChecking.cpp, checking for unique builtin declarations. void NeonEmitter::genOverloadTypeCheckCode(raw_ostream &OS, @@ -2221,6 +2247,8 @@ // Generate ARM overloaded type checking code for SemaChecking.cpp genOverloadTypeCheckCode(OS, Defs); + genStreamingSVECompatibleList(OS, Defs); + // Generate ARM range checking code for shift/lane immediates. genIntrinsicRangeCheckCode(OS, Defs); } diff --git a/clang/utils/TableGen/SveEmitter.cpp b/clang/utils/TableGen/SveEmitter.cpp --- a/clang/utils/TableGen/SveEmitter.cpp +++ b/clang/utils/TableGen/SveEmitter.cpp @@ -43,6 +43,8 @@ ClassG, // Overloaded name without type suffix }; +enum class ACLEKind { SVE, SME }; + using TypeSpec = std::string; namespace { @@ -66,17 +68,19 @@ class SVEType { TypeSpec TS; bool Float, Signed, Immediate, Void, Constant, Pointer, BFloat; - bool DefaultType, IsScalable, Predicate, PredicatePattern, PrefetchOp; + bool DefaultType, IsScalable, Predicate, PredicatePattern, PrefetchOp, + Svcount; unsigned Bitwidth, ElementBitwidth, NumVectors; public: SVEType() : SVEType(TypeSpec(), 'v') {} - SVEType(TypeSpec TS, char CharMod) + SVEType(TypeSpec TS, char CharMod, unsigned NumVectors = 1) : TS(TS), Float(false), Signed(true), Immediate(false), Void(false), Constant(false), Pointer(false), BFloat(false), DefaultType(false), IsScalable(true), Predicate(false), PredicatePattern(false), - PrefetchOp(false), Bitwidth(128), ElementBitwidth(~0U), NumVectors(1) { + PrefetchOp(false), Svcount(false), Bitwidth(128), ElementBitwidth(~0U), + NumVectors(NumVectors) { if (!TS.empty()) applyTypespec(); applyModifier(CharMod); @@ -95,13 +99,14 @@ bool isFloat() const { return Float && !BFloat; } bool isBFloat() const { return BFloat && !Float; } bool isFloatingPoint() const { return Float || BFloat; } - bool isInteger() const { return !isFloatingPoint() && !Predicate; } + bool isInteger() const { return !isFloatingPoint() && !Predicate && !Svcount; } bool isScalarPredicate() const { return !isFloatingPoint() && Predicate && NumVectors == 0; } bool isPredicateVector() const { return Predicate; } bool isPredicatePattern() const { return PredicatePattern; } bool isPrefetchOp() const { return PrefetchOp; } + bool isSvcount() const { return Svcount; } bool isConstant() const { return Constant; } unsigned getElementSizeInBits() const { return ElementBitwidth; } unsigned getNumVectors() const { return NumVectors; } @@ -189,7 +194,9 @@ SVEType getReturnType() const { return Types[0]; } ArrayRef getTypes() const { return Types; } SVEType getParamType(unsigned I) const { return Types[I + 1]; } - unsigned getNumParams() const { return Proto.size() - 1; } + unsigned getNumParams() const { + return Proto.size() - (2 * std::count(Proto.begin(), Proto.end(), '.')) - 1; + } uint64_t getFlags() const { return Flags; } bool isFlagSet(uint64_t Flag) const { return Flags & Flag;} @@ -203,6 +210,9 @@ /// ClassS, so will add type suffixes such as _u32/_s32. std::string getMangledName() const { return mangleName(ClassS); } + /// As above, but mangles the LLVM name instead. 
+ std::string getMangledLLVMName() const { return mangleLLVMName(); } + /// Returns true if the intrinsic is overloaded, in that it should also generate /// a short form without the type-specifiers, e.g. 'svld1(..)' instead of /// 'svld1_u32(..)'. @@ -220,19 +230,28 @@ /// Return the parameter index of the splat operand. unsigned getSplatIdx() const { - // These prototype modifiers are described in arm_sve.td. - auto Idx = Proto.find_first_of("ajfrKLR@"); - assert(Idx != std::string::npos && Idx > 0 && - "Prototype has no splat operand"); - return Idx - 1; + unsigned I = 1, Param = 0; + for (; I < Proto.size(); ++I, ++Param) { + if (Proto[I] == 'a' || Proto[I] == 'j' || Proto[I] == 'f' || + Proto[I] == 'r' || Proto[I] == 'K' || Proto[I] == 'L' || + Proto[I] == 'R' || Proto[I] == '@') + break; + + // Multivector modifier can be skipped + if (Proto[I] == '.') + I += 2; + } + assert(I != Proto.size() && "Prototype has no splat operand"); + return Param; } /// Emits the intrinsic declaration to the ostream. - void emitIntrinsic(raw_ostream &OS) const; + void emitIntrinsic(raw_ostream &OS, ACLEKind Kind) const; private: std::string getMergeSuffix() const { return MergeSuffix; } std::string mangleName(ClassKind LocalCK) const; + std::string mangleLLVMName() const; std::string replaceTemplatedArgs(std::string Name, TypeSpec TS, std::string Proto) const; }; @@ -334,18 +353,33 @@ /// Emit arm_sve.h. void createHeader(raw_ostream &o); + // Emits core intrinsics in both arm_sme.h and arm_sve.h + void createCoreHeaderIntrinsics(raw_ostream &o, ACLEKind Kind); + /// Emit all the __builtin prototypes and code needed by Sema. - void createBuiltins(raw_ostream &o); + void createBuiltins(raw_ostream &o, ACLEKind Kind); /// Emit all the information needed to map builtin -> LLVM IR intrinsic. - void createCodeGenMap(raw_ostream &o); + void createCodeGenMap(raw_ostream &o, ACLEKind Kind); /// Emit all the range checks for the immediates. - void createRangeChecks(raw_ostream &o); + void createRangeChecks(raw_ostream &o, ACLEKind Kind); /// Create the SVETypeFlags used in CGBuiltins void createTypeFlags(raw_ostream &o); + /// Emit arm_sme.h. + void createSmeHeader(raw_ostream &o); + + /// Create the SMETypeFlags used in CGBuiltins + void createSmeTypeFlags(raw_ostream &o); + + /// Create a table for a builtin's requirement for PSTATE.SM. + void createStreamingAttrs(raw_ostream &o, ACLEKind Kind); + + /// Create a table for a builtin's requirement for PSTATE.ZA. 
+ void createBuiltinZAState(raw_ostream &OS); + /// Create intrinsic and add it to \p Out void createIntrinsic(Record *R, SmallVectorImpl> &Out); }; @@ -365,6 +399,9 @@ if (isScalarPredicate()) return "b"; + if (isSvcount()) + return "Q"; + if (isVoidPointer()) S += "v"; else if (!isFloatingPoint()) @@ -425,16 +462,19 @@ return "enum svprfop"; std::string S; + if (Void) S += "void"; else { - if (isScalableVector()) + if (isScalableVector() || isSvcount()) S += "sv"; if (!Signed && !isFloatingPoint()) S += "u"; if (Float) S += "float"; + else if (isSvcount()) + S += "count"; else if (isScalarPredicate() || isPredicateVector()) S += "bool"; else if (isBFloat()) @@ -442,7 +482,7 @@ else S += "int"; - if (!isScalarPredicate() && !isPredicateVector()) + if (!isScalarPredicate() && !isPredicateVector() && !isSvcount()) S += utostr(ElementBitwidth); if (!isScalableVector() && isVector()) S += "x" + utostr(getNumElements()); @@ -462,6 +502,9 @@ void SVEType::applyTypespec() { for (char I : TS) { switch (I) { + case 'Q': + Svcount = true; + break; case 'P': Predicate = true; break; @@ -506,15 +549,6 @@ void SVEType::applyModifier(char Mod) { switch (Mod) { - case '2': - NumVectors = 2; - break; - case '3': - NumVectors = 3; - break; - case '4': - NumVectors = 4; - break; case 'v': Void = true; break; @@ -553,6 +587,7 @@ Float = false; BFloat = false; Predicate = true; + Svcount = false; Bitwidth = 16; ElementBitwidth = 1; break; @@ -592,18 +627,21 @@ break; case 'u': Predicate = false; + Svcount = false; Signed = false; Float = false; BFloat = false; break; case 'x': Predicate = false; + Svcount = false; Signed = true; Float = false; BFloat = false; break; case 'i': Predicate = false; + Svcount = false; Float = false; BFloat = false; ElementBitwidth = Bitwidth = 64; @@ -613,6 +651,7 @@ break; case 'I': Predicate = false; + Svcount = false; Float = false; BFloat = false; ElementBitwidth = Bitwidth = 32; @@ -623,6 +662,7 @@ break; case 'J': Predicate = false; + Svcount = false; Float = false; BFloat = false; ElementBitwidth = Bitwidth = 32; @@ -633,6 +673,7 @@ break; case 'k': Predicate = false; + Svcount = false; Signed = true; Float = false; BFloat = false; @@ -641,6 +682,7 @@ break; case 'l': Predicate = false; + Svcount = false; Signed = true; Float = false; BFloat = false; @@ -649,6 +691,7 @@ break; case 'm': Predicate = false; + Svcount = false; Signed = false; Float = false; BFloat = false; @@ -657,6 +700,7 @@ break; case 'n': Predicate = false; + Svcount = false; Signed = false; Float = false; BFloat = false; @@ -695,20 +739,30 @@ break; case 'O': Predicate = false; + Svcount = false; Float = true; ElementBitwidth = 16; break; case 'M': Predicate = false; + Svcount = false; Float = true; BFloat = false; ElementBitwidth = 32; break; case 'N': Predicate = false; + Svcount = false; Float = true; ElementBitwidth = 64; break; + case '$': + Predicate = false; + Svcount = false; + Float = false; + BFloat = true; + ElementBitwidth = 16; + break; case 'Q': Constant = true; Pointer = true; @@ -799,11 +853,56 @@ NumVectors = 0; Signed = false; break; + case '{': + Pointer = true; + Void = true; + NumVectors = 0; + break; + case '}': + Predicate = false; + Signed = true; + Svcount = true; + NumVectors = 0; + Float = false; + BFloat = false; + break; + case 'y': + Predicate = true; + Svcount = false; + NumVectors = 0; + Float = false; + BFloat = false; + break; + case '.' : + llvm_unreachable(". 
should never be a type"); + break; default: llvm_unreachable("Unhandled character!"); } } +/// Returns the modifier and number of vectors for the given operand \p Op. +std::pair getProtoModifier(StringRef Proto, unsigned Op) { + for (unsigned P = 0; !Proto.empty(); ++P) { + unsigned NumVectors = 1; + unsigned CharsToSkip = 1; + char Mod = Proto[0]; + if (Mod == '2' || Mod == '3' || Mod == '4') { + NumVectors = Mod - '0'; + Mod = 'd'; + if (Proto.size() > 1 && Proto[1] == '.') { + Mod = Proto[2]; + CharsToSkip = 3; + } + } + + if (P == Op) + return {Mod, NumVectors}; + + Proto = Proto.drop_front(CharsToSkip); + } + llvm_unreachable("Unexpected Op"); +} //===----------------------------------------------------------------------===// // Intrinsic implementation @@ -819,18 +918,21 @@ MergeSuffix(MergeSuffix.str()), BaseType(BT, 'd'), Flags(Flags), ImmChecks(Checks.begin(), Checks.end()) { // Types[0] is the return value. - for (unsigned I = 0; I < Proto.size(); ++I) { - SVEType T(BaseTypeSpec, Proto[I]); + for (unsigned I = 0; I < (getNumParams() + 1); ++I) { + char Mod; + unsigned NumVectors; + std::tie(Mod, NumVectors) = getProtoModifier(Proto, I); + SVEType T(BaseTypeSpec, Mod, NumVectors); Types.push_back(T); // Add range checks for immediates if (I > 0) { if (T.isPredicatePattern()) - ImmChecks.emplace_back( - I - 1, Emitter.getEnumValueForImmCheck("ImmCheck0_31")); + ImmChecks.emplace_back(I - 1, + Emitter.getEnumValueForImmCheck("ImmCheck0_31")); else if (T.isPrefetchOp()) - ImmChecks.emplace_back( - I - 1, Emitter.getEnumValueForImmCheck("ImmCheck0_13")); + ImmChecks.emplace_back(I - 1, + Emitter.getEnumValueForImmCheck("ImmCheck0_13")); } } @@ -879,6 +981,8 @@ std::string TypeCode; if (T.isInteger()) TypeCode = T.isSigned() ? 's' : 'u'; + else if (T.isSvcount()) + TypeCode = 'c'; else if (T.isPredicateVector()) TypeCode = 'b'; else if (T.isBFloat()) @@ -891,6 +995,13 @@ return Ret; } +std::string Intrinsic::mangleLLVMName() const { + std::string S = getLLVMName(); + + // Replace all {d} like expressions with e.g. 'u32' + return replaceTemplatedArgs(S, getBaseTypeSpec(), getProto()); +} + std::string Intrinsic::mangleName(ClassKind LocalCK) const { std::string S = getName(); @@ -918,15 +1029,23 @@ getMergeSuffix(); } -void Intrinsic::emitIntrinsic(raw_ostream &OS) const { +void Intrinsic::emitIntrinsic(raw_ostream &OS, ACLEKind Kind) const { bool IsOverloaded = getClassKind() == ClassG && getProto().size() > 1; std::string FullName = mangleName(ClassS); std::string ProtoName = mangleName(getClassKind()); OS << (IsOverloaded ? 
"__aio " : "__ai ") - << "__attribute__((__clang_arm_builtin_alias(" - << "__builtin_sve_" << FullName << ")))\n"; + << "__attribute__((__clang_arm_builtin_alias("; + + switch (Kind) { + case ACLEKind::SME: + OS << "__builtin_sme_" << FullName << ")))\n"; + break; + case ACLEKind::SVE: + OS << "__builtin_sve_" << FullName << ")))\n"; + break; + } OS << getTypes()[0].str() << " " << ProtoName << "("; for (unsigned I = 0; I < getTypes().size() - 1; ++I) { @@ -959,7 +1078,7 @@ return encodeEltType("EltTyBFloat16"); } - if (T.isPredicateVector()) { + if (T.isPredicateVector() || T.isSvcount()) { switch (T.getElementSizeInBits()) { case 8: return encodeEltType("EltTyBool8"); @@ -1039,10 +1158,12 @@ assert(Arg >= 0 && Kind >= 0 && "Arg and Kind must be nonnegative"); unsigned ElementSizeInBits = 0; + char Mod; + unsigned NumVectors; + std::tie(Mod, NumVectors) = getProtoModifier(Proto, EltSizeArg + 1); if (EltSizeArg >= 0) - ElementSizeInBits = - SVEType(TS, Proto[EltSizeArg + /* offset by return arg */ 1]) - .getElementSizeInBits(); + ElementSizeInBits = SVEType(TS, Mod, NumVectors).getElementSizeInBits(); + ImmChecks.push_back(ImmCheck(Arg, Kind, ElementSizeInBits)); } @@ -1058,6 +1179,31 @@ } } +void SVEEmitter::createCoreHeaderIntrinsics(raw_ostream &OS, ACLEKind Kind) { + SmallVector, 128> Defs; + std::vector RV = Records.getAllDerivedDefinitions("Inst"); + for (auto *R : RV) + createIntrinsic(R, Defs); + + // Sort intrinsics in header file by following order/priority: + // - Architectural guard (i.e. does it require SVE2 or SVE2_AES) + // - Class (is ntrinsic overloaded or not) + // - Intrinsic name + std::stable_sort(Defs.begin(), Defs.end(), + [](const std::unique_ptr &A, + const std::unique_ptr &B) { + auto ToTuple = [](const std::unique_ptr &I) { + return std::make_tuple(I->getGuard(), + (unsigned)I->getClassKind(), + I->getName()); + }; + return ToTuple(A) < ToTuple(B); + }); + + for (auto &I : Defs) + I->emitIntrinsic(OS, Kind); +} + void SVEEmitter::createHeader(raw_ostream &OS) { OS << "/*===---- arm_sve.h - ARM SVE intrinsics " "-----------------------------------===\n" @@ -1147,6 +1293,8 @@ OS << "typedef __clang_svbfloat16x3_t svbfloat16x3_t;\n"; OS << "typedef __clang_svbfloat16x4_t svbfloat16x4_t;\n"; + OS << "typedef __SVCount_t svcount_t;\n\n"; + OS << "enum svpattern\n"; OS << "{\n"; OS << " SV_POW2 = 0,\n"; @@ -1189,6 +1337,17 @@ "__nodebug__))\n\n"; OS << "#define __aio static __inline__ __attribute__((__always_inline__, " "__nodebug__, __overloadable__))\n\n"; + OS << "#ifdef __ARM_FEATURE_SME\n"; + OS << "#define __asc __attribute__((arm_streaming_compatible))\n"; + OS << "#else\n"; + OS << "#define __asc\n"; + OS << "#endif\n\n"; + + OS << "#if defined(__ARM_FEATURE_SME2)\n"; + OS << "#define __as __attribute__((arm_streaming))\n"; + OS << "#else\n"; + OS << "#define __as\n"; + OS << "#endif\n\n"; // Add reinterpret functions. for (auto ShortForm : { false, true } ) @@ -1207,27 +1366,7 @@ << To.Suffix << "(__VA_ARGS__)\n"; } - SmallVector, 128> Defs; - std::vector RV = Records.getAllDerivedDefinitions("Inst"); - for (auto *R : RV) - createIntrinsic(R, Defs); - - // Sort intrinsics in header file by following order/priority: - // - Architectural guard (i.e. 
does it require SVE2 or SVE2_AES) - // - Class (is intrinsic overloaded or not) - // - Intrinsic name - std::stable_sort( - Defs.begin(), Defs.end(), [](const std::unique_ptr &A, - const std::unique_ptr &B) { - auto ToTuple = [](const std::unique_ptr &I) { - return std::make_tuple(I->getGuard(), (unsigned)I->getClassKind(), I->getName()); - }; - return ToTuple(A) < ToTuple(B); - }); - - // Actually emit the intrinsic declarations. - for (auto &I : Defs) - I->emitIntrinsic(OS); + createCoreHeaderIntrinsics(OS, ACLEKind::SVE); OS << "#define svcvtnt_bf16_x svcvtnt_bf16_m\n"; OS << "#define svcvtnt_bf16_f32_x svcvtnt_bf16_f32_m\n"; @@ -1248,7 +1387,7 @@ OS << "#endif /* __ARM_SVE_H */\n"; } -void SVEEmitter::createBuiltins(raw_ostream &OS) { +void SVEEmitter::createBuiltins(raw_ostream &OS, ACLEKind Kind) { std::vector RV = Records.getAllDerivedDefinitions("Inst"); SmallVector, 128> Defs; for (auto *R : RV) @@ -1260,14 +1399,36 @@ return A->getMangledName() < B->getMangledName(); }); - OS << "#ifdef GET_SVE_BUILTINS\n"; + switch (Kind) { + case ACLEKind::SME: + OS << "#ifdef GET_SME_BUILTINS\n"; + break; + case ACLEKind::SVE: + OS << "#ifdef GET_SVE_BUILTINS\n"; + break; + } + for (auto &Def : Defs) { // Only create BUILTINs for non-overloaded intrinsics, as overloaded // declarations only live in the header file. - if (Def->getClassKind() != ClassG) - OS << "TARGET_BUILTIN(__builtin_sve_" << Def->getMangledName() << ", \"" + if (Def->getClassKind() != ClassG) { + switch (Kind) { + case ACLEKind::SME: + OS << "TARGET_BUILTIN(__builtin_sme_"; + break; + case ACLEKind::SVE: + OS << "TARGET_BUILTIN(__builtin_sve_"; + } + + OS << Def->getMangledName() << ", \"" << Def->getBuiltinTypeStr() << "\", \"n\", \"" << Def->getGuard() << "\")\n"; + } + } + + if (Kind == ACLEKind::SME) { + OS << "#endif\n\n"; + return; } // Add reinterpret builtins @@ -1280,7 +1441,7 @@ OS << "#endif\n\n"; } -void SVEEmitter::createCodeGenMap(raw_ostream &OS) { +void SVEEmitter::createCodeGenMap(raw_ostream &OS, ACLEKind Kind) { std::vector RV = Records.getAllDerivedDefinitions("Inst"); SmallVector, 128> Defs; for (auto *R : RV) @@ -1292,7 +1453,15 @@ return A->getMangledName() < B->getMangledName(); }); - OS << "#ifdef GET_SVE_LLVM_INTRINSIC_MAP\n"; + switch (Kind) { + case ACLEKind::SME: + OS << "#ifdef GET_SME_LLVM_INTRINSIC_MAP\n"; + break; + case ACLEKind::SVE: + OS << "#ifdef GET_SVE_LLVM_INTRINSIC_MAP\n"; + break; + } + for (auto &Def : Defs) { // Builtins only exist for non-overloaded intrinsics, overloaded // declarations only live in the header file. 
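As an illustrative sketch (not taken from the patch itself): with the ACLEKind parameter threaded through createBuiltins and createCodeGenMap, the same emitter body now produces either SVE- or SME-prefixed tables. The fragment below shows the shape of the generated arm_sme_builtins.inc and arm_sme_builtin_cg_map.inc output; the builtin name, type string and intrinsic name are invented placeholders, not real entries:

// Hypothetical fragment of the generated arm_sme_builtins.inc
#ifdef GET_SME_BUILTINS
TARGET_BUILTIN(__builtin_sme_svexample_za, "vUi", "n", "sme")
#endif

// Hypothetical fragment of the generated arm_sme_builtin_cg_map.inc; the
// SMEMAP1/SMEMAP2 spellings follow from the "SME" + "MAP1("/"MAP2(" pieces
// that createCodeGenMap concatenates above.
#ifdef GET_SME_LLVM_INTRINSIC_MAP
SMEMAP1(svexample_za, aarch64_sme_example, 0),  // maps directly to an LLVM intrinsic
SMEMAP2(svexample2_za, 0),                      // no direct mapping; handled in CGBuiltin
#endif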
@@ -1302,18 +1471,26 @@ uint64_t Flags = Def->getFlags(); auto FlagString = std::to_string(Flags); - std::string LLVMName = Def->getLLVMName(); + std::string LLVMName = Def->getMangledLLVMName(); std::string Builtin = Def->getMangledName(); + switch (Kind) { + case ACLEKind::SME: + OS << "SME"; + break; + case ACLEKind::SVE: + OS << "SVE"; + break; + } if (!LLVMName.empty()) - OS << "SVEMAP1(" << Builtin << ", " << LLVMName << ", " << FlagString + OS << "MAP1(" << Builtin << ", " << LLVMName << ", " << FlagString << "),\n"; else - OS << "SVEMAP2(" << Builtin << ", " << FlagString << "),\n"; + OS << "MAP2(" << Builtin << ", " << FlagString << "),\n"; } OS << "#endif\n\n"; } -void SVEEmitter::createRangeChecks(raw_ostream &OS) { +void SVEEmitter::createRangeChecks(raw_ostream &OS, ACLEKind Kind) { std::vector RV = Records.getAllDerivedDefinitions("Inst"); SmallVector, 128> Defs; for (auto *R : RV) @@ -1325,8 +1502,14 @@ return A->getMangledName() < B->getMangledName(); }); - - OS << "#ifdef GET_SVE_IMMEDIATE_CHECK\n"; + switch (Kind) { + case ACLEKind::SME: + OS << "#ifdef GET_SME_IMMEDIATE_CHECK\n"; + break; + case ACLEKind::SVE: + OS << "#ifdef GET_SVE_IMMEDIATE_CHECK\n"; + break; + } // Ensure these are only emitted once. std::set Emitted; @@ -1336,7 +1519,15 @@ Def->getImmChecks().empty()) continue; - OS << "case SVE::BI__builtin_sve_" << Def->getMangledName() << ":\n"; + switch (Kind) { + case ACLEKind::SME: + OS << "case SME::BI__builtin_sme_"; + break; + case ACLEKind::SVE: + OS << "case SVE::BI__builtin_sve_"; + break; + } + OS << Def->getMangledName() << ":\n"; for (auto &Check : Def->getImmChecks()) OS << "ImmChecks.push_back(std::make_tuple(" << Check.getArg() << ", " << Check.getKind() << ", " << Check.getElementSizeInBits() << "));\n"; @@ -1376,25 +1567,224 @@ OS << "#endif\n\n"; } +void SVEEmitter::createSmeHeader(raw_ostream &OS) { + OS << "/*===---- arm_sme.h - ARM SME intrinsics " + "-----------------------------------===\n" + " *\n" + " *\n" + " * Part of the LLVM Project, under the Apache License v2.0 with LLVM " + "Exceptions.\n" + " * See https://llvm.org/LICENSE.txt for license information.\n" + " * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception\n" + " *\n" + " *===-----------------------------------------------------------------" + "------===\n" + " */\n\n"; + + OS << "#ifndef __ARM_SME_H\n"; + OS << "#define __ARM_SME_H\n\n"; + + OS << "#if !defined(__LITTLE_ENDIAN__)\n"; + OS << "#error \"Big endian is currently not supported for arm_sme.h\"\n"; + OS << "#endif\n"; + + OS << "#include \n\n"; + OS << "#include \n\n"; + OS << "#ifdef __cplusplus\n"; + OS << "extern \"C\" {\n"; + OS << "#else\n"; + OS << "#include \n"; + OS << "#endif\n\n"; + + OS << "#include \"arm_sve.h\"\n\n"; + + OS << "/* Function attributes */\n"; + OS << "#define __ai static __inline__ __attribute__((__always_inline__, " + "__nodebug__))\n\n"; + OS << "#define __aio static __inline__ __attribute__((__always_inline__, " + "__nodebug__, __overloadable__))\n\n"; + OS << "#define __as __attribute__((arm_streaming))\n\n"; + OS << "#define __asc __attribute__((arm_streaming_compatible))\n\n"; + OS << "#define __asza __attribute__((arm_shared_za))\n\n"; + OS << "#define __apza __attribute__((arm_preserves_za))\n\n"; + + OS << "__asc void __arm_za_disable(void);\n"; + OS << "__ai __asc bool __arm_has_sme(void) {\n"; + OS << " uint64_t x0, x1;\n"; + OS << " __builtin_arm_get_sme_state(&x0, &x1);\n"; + OS << " return x0 & (1ULL << 63);\n"; + OS << "}\n"; + + OS << "__ai __asc bool 
__arm_in_streaming_mode(void) {\n"; + OS << " uint64_t x0, x1;\n"; + OS << " __builtin_arm_get_sme_state(&x0, &x1);\n"; + OS << " return x0 & 1;\n"; + OS << "}\n"; + OS << "__ai __asc __asza void svundef_za(void) { }\n\n"; + + createCoreHeaderIntrinsics(OS, ACLEKind::SME); + + OS << "__ai __asc __asza void svzero_za(void) { svzero_mask_za(255); }\n\n"; + + OS << "__asc __apza void *__arm_sc_memcpy(void *dest, const void *src, size_t n);\n"; + OS << "__asc __apza void *__arm_sc_memmove(void *dest, const void *src, size_t n);\n"; + OS << "__asc __apza void *__arm_sc_memset(void *s, int c, size_t n);\n"; + OS << "__asc __apza void *__arm_sc_memchr(void *s, int c, size_t n);\n\n"; + + OS << "#ifdef __cplusplus\n"; + OS << "} // extern \"C\"\n"; + OS << "#endif\n\n"; + OS << "#endif /* __ARM_SME_H */\n"; +} + +/// Create the SMETypeFlags used in CGBuiltins +void SVEEmitter::createSmeTypeFlags(raw_ostream &OS) { + OS << "#ifdef LLVM_GET_SME_IMMCHECKTYPES\n"; + for (auto &KV : ImmCheckTypes) + OS << " " << KV.getKey() << " = " << KV.getValue() << ",\n"; + OS << "#endif\n\n"; +} + +void SVEEmitter::createBuiltinZAState(raw_ostream &OS) { + std::vector RV = Records.getAllDerivedDefinitions("Inst"); + SmallVector, 128> Defs; + for (auto *R : RV) + createIntrinsic(R, Defs); + + // The mappings must be sorted based on BuiltinID. + llvm::sort(Defs, [](const std::unique_ptr &A, + const std::unique_ptr &B) { + return A->getMangledName() < B->getMangledName(); + }); + + OS << "#ifdef GET_SME_BUILTIN_HAS_ZA_STATE\n"; + + // Ensure these are only emitted once. + std::set Emitted; + + uint64_t IsSharedZAFlag = getEnumValueForFlag("IsSharedZA"); + for (auto &Def : Defs) { + if (Emitted.find(Def->getMangledName()) != Emitted.end()) + continue; + + OS << "case SME::BI__builtin_sme_" << Def->getMangledName() << ":\n"; + if (Def->isFlagSet(IsSharedZAFlag)) + OS << " return true;\n"; + else + OS << " return false;\n"; + + Emitted.insert(Def->getMangledName()); + } + + OS << "#endif\n\n"; +} + +void SVEEmitter::createStreamingAttrs(raw_ostream &OS, ACLEKind Kind) { + std::vector RV = Records.getAllDerivedDefinitions("Inst"); + SmallVector, 128> Defs; + for (auto *R : RV) + createIntrinsic(R, Defs); + + // The mappings must be sorted based on BuiltinID. + llvm::sort(Defs, [](const std::unique_ptr &A, + const std::unique_ptr &B) { + return A->getMangledName() < B->getMangledName(); + }); + + switch (Kind) { + case ACLEKind::SME: + OS << "#ifdef GET_SME_STREAMING_ATTRS\n"; + break; + case ACLEKind::SVE: + OS << "#ifdef GET_SVE_STREAMING_ATTRS\n"; + break; + } + + // Ensure these are only emitted once. 
+ std::set Emitted; + + uint64_t IsStreamingFlag = getEnumValueForFlag("IsStreaming"); + uint64_t IsStreamingCompatibleFlag = + getEnumValueForFlag("IsStreamingCompatible"); + for (auto &Def : Defs) { + if (Emitted.find(Def->getMangledName()) != Emitted.end()) + continue; + + switch (Kind) { + case ACLEKind::SME: + OS << "case SME::BI__builtin_sme_"; + break; + case ACLEKind::SVE: + OS << "case SVE::BI__builtin_sve_"; + break; + } + OS << Def->getMangledName() << ":\n"; + + if (Def->isFlagSet(IsStreamingFlag)) + OS << " BuiltinType = ArmStreaming;\n"; + else if (Def->isFlagSet(IsStreamingCompatibleFlag)) + OS << " BuiltinType = ArmStreamingCompatible;\n"; + else + OS << " BuiltinType = ArmNonStreaming;\n"; + OS << " break;\n"; + + Emitted.insert(Def->getMangledName()); + } + + OS << "#endif\n\n"; +} + namespace clang { void EmitSveHeader(RecordKeeper &Records, raw_ostream &OS) { SVEEmitter(Records).createHeader(OS); } void EmitSveBuiltins(RecordKeeper &Records, raw_ostream &OS) { - SVEEmitter(Records).createBuiltins(OS); + SVEEmitter(Records).createBuiltins(OS, ACLEKind::SVE); } void EmitSveBuiltinCG(RecordKeeper &Records, raw_ostream &OS) { - SVEEmitter(Records).createCodeGenMap(OS); + SVEEmitter(Records).createCodeGenMap(OS, ACLEKind::SVE); } void EmitSveRangeChecks(RecordKeeper &Records, raw_ostream &OS) { - SVEEmitter(Records).createRangeChecks(OS); + SVEEmitter(Records).createRangeChecks(OS, ACLEKind::SVE); } void EmitSveTypeFlags(RecordKeeper &Records, raw_ostream &OS) { SVEEmitter(Records).createTypeFlags(OS); } +void EmitSveStreamingAttrs(RecordKeeper &Records, raw_ostream &OS) { + SVEEmitter(Records).createStreamingAttrs(OS, ACLEKind::SVE); +} + +void EmitSmeHeader(RecordKeeper &Records, raw_ostream &OS) { + SVEEmitter(Records).createSmeHeader(OS); +} + +void EmitSmeBuiltins(RecordKeeper &Records, raw_ostream &OS) { + SVEEmitter(Records).createBuiltins(OS, ACLEKind::SME); +} + +void EmitSmeBuiltinCG(RecordKeeper &Records, raw_ostream &OS) { + SVEEmitter(Records).createCodeGenMap(OS, ACLEKind::SME); +} + +void EmitSmeRangeChecks(RecordKeeper &Records, raw_ostream &OS) { + SVEEmitter(Records).createRangeChecks(OS, ACLEKind::SME); +} + +void EmitSmeTypeFlags(RecordKeeper &Records, raw_ostream &OS) { + SVEEmitter(Records).createSmeTypeFlags(OS); +} + +void EmitSmeStreamingAttrs(RecordKeeper &Records, raw_ostream &OS) { + SVEEmitter(Records).createStreamingAttrs(OS, ACLEKind::SME); +} + +void EmitSmeBuiltinZAState(RecordKeeper &Records, raw_ostream &OS) { + SVEEmitter(Records).createBuiltinZAState(OS); +} + } // End namespace clang diff --git a/clang/utils/TableGen/TableGen.cpp b/clang/utils/TableGen/TableGen.cpp --- a/clang/utils/TableGen/TableGen.cpp +++ b/clang/utils/TableGen/TableGen.cpp @@ -82,6 +82,14 @@ GenArmSveBuiltinCG, GenArmSveTypeFlags, GenArmSveRangeChecks, + GenArmSveStreamingAttrs, + GenArmSmeBuiltinZAState, + GenArmSmeHeader, + GenArmSmeBuiltins, + GenArmSmeBuiltinCG, + GenArmSmeTypeFlags, + GenArmSmeRangeChecks, + GenArmSmeStreamingAttrs, GenArmCdeHeader, GenArmCdeBuiltinDef, GenArmCdeBuiltinSema, @@ -226,6 +234,22 @@ "Generate arm_sve_typeflags.inc for clang"), clEnumValN(GenArmSveRangeChecks, "gen-arm-sve-sema-rangechecks", "Generate arm_sve_sema_rangechecks.inc for clang"), + clEnumValN(GenArmSveStreamingAttrs, "gen-arm-sve-streaming-attrs", + "Generate arm_sve_streaming_attrs.inc for clang"), + clEnumValN(GenArmSmeHeader, "gen-arm-sme-header", + "Generate arm_sme.h for clang"), + clEnumValN(GenArmSmeBuiltins, "gen-arm-sme-builtins", + "Generate 
arm_sme_builtins.inc for clang"),
+        clEnumValN(GenArmSmeBuiltinCG, "gen-arm-sme-builtin-codegen",
+                   "Generate arm_sme_builtin_cg_map.inc for clang"),
+        clEnumValN(GenArmSmeTypeFlags, "gen-arm-sme-typeflags",
+                   "Generate arm_sme_typeflags.inc for clang"),
+        clEnumValN(GenArmSmeRangeChecks, "gen-arm-sme-sema-rangechecks",
+                   "Generate arm_sme_sema_rangechecks.inc for clang"),
+        clEnumValN(GenArmSmeStreamingAttrs, "gen-arm-sme-streaming-attrs",
+                   "Generate arm_sme_streaming_attrs.inc for clang"),
+        clEnumValN(GenArmSmeBuiltinZAState, "gen-arm-sme-builtin-za-state",
+                   "Generate arm_sme_builtins_za_state.inc for clang"),
         clEnumValN(GenArmMveHeader, "gen-arm-mve-header",
                    "Generate arm_mve.h for clang"),
         clEnumValN(GenArmMveBuiltinDef, "gen-arm-mve-builtin-def",
@@ -454,6 +478,30 @@
   case GenArmSveRangeChecks:
     EmitSveRangeChecks(Records, OS);
     break;
+  case GenArmSveStreamingAttrs:
+    EmitSveStreamingAttrs(Records, OS);
+    break;
+  case GenArmSmeHeader:
+    EmitSmeHeader(Records, OS);
+    break;
+  case GenArmSmeBuiltins:
+    EmitSmeBuiltins(Records, OS);
+    break;
+  case GenArmSmeBuiltinCG:
+    EmitSmeBuiltinCG(Records, OS);
+    break;
+  case GenArmSmeTypeFlags:
+    EmitSmeTypeFlags(Records, OS);
+    break;
+  case GenArmSmeRangeChecks:
+    EmitSmeRangeChecks(Records, OS);
+    break;
+  case GenArmSmeStreamingAttrs:
+    EmitSmeStreamingAttrs(Records, OS);
+    break;
+  case GenArmSmeBuiltinZAState:
+    EmitSmeBuiltinZAState(Records, OS);
+    break;
   case GenArmCdeHeader:
     EmitCdeHeader(Records, OS);
     break;
diff --git a/clang/utils/TableGen/TableGenBackends.h b/clang/utils/TableGen/TableGenBackends.h
--- a/clang/utils/TableGen/TableGenBackends.h
+++ b/clang/utils/TableGen/TableGenBackends.h
@@ -100,6 +100,15 @@
 void EmitSveBuiltinCG(llvm::RecordKeeper &Records, llvm::raw_ostream &OS);
 void EmitSveTypeFlags(llvm::RecordKeeper &Records, llvm::raw_ostream &OS);
 void EmitSveRangeChecks(llvm::RecordKeeper &Records, llvm::raw_ostream &OS);
+void EmitSveStreamingAttrs(llvm::RecordKeeper &Records, llvm::raw_ostream &OS);
+
+void EmitSmeHeader(llvm::RecordKeeper &Records, llvm::raw_ostream &OS);
+void EmitSmeBuiltins(llvm::RecordKeeper &Records, llvm::raw_ostream &OS);
+void EmitSmeBuiltinCG(llvm::RecordKeeper &Records, llvm::raw_ostream &OS);
+void EmitSmeTypeFlags(llvm::RecordKeeper &Records, llvm::raw_ostream &OS);
+void EmitSmeRangeChecks(llvm::RecordKeeper &Records, llvm::raw_ostream &OS);
+void EmitSmeStreamingAttrs(llvm::RecordKeeper &Records, llvm::raw_ostream &OS);
+void EmitSmeBuiltinZAState(llvm::RecordKeeper &Records, llvm::raw_ostream &OS);
 void EmitMveHeader(llvm::RecordKeeper &Records, llvm::raw_ostream &OS);
 void EmitMveBuiltinDef(llvm::RecordKeeper &Records, llvm::raw_ostream &OS);
diff --git a/llvm/include/llvm/IR/IRBuilder.h b/llvm/include/llvm/IR/IRBuilder.h
--- a/llvm/include/llvm/IR/IRBuilder.h
+++ b/llvm/include/llvm/IR/IRBuilder.h
@@ -974,11 +974,13 @@
   }
 
   /// Create a call to the vector.extract intrinsic.
-  CallInst *CreateExtractVector(Type *DstType, Value *SrcVec, Value *Idx,
-                                const Twine &Name = "") {
+  Value *CreateExtractVector(Type *DstType, Value *SrcVec, Value *Idx,
+                             const Twine &Name = "") {
+    if (cast<ConstantInt>(Idx)->isZero() && DstType == SrcVec->getType())
+      return SrcVec;
     return CreateIntrinsic(Intrinsic::vector_extract,
-                           {DstType, SrcVec->getType()}, {SrcVec, Idx}, nullptr,
-                           Name);
+                           {DstType, SrcVec->getType()}, {SrcVec, Idx},
+                           nullptr, Name);
   }
 
   /// Create a call to the vector.insert intrinsic.
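As an illustrative sketch (not taken from the patch): the CreateExtractVector change above folds a whole-vector extract at index 0 to the source value instead of always emitting a call to llvm.vector.extract, which is why the return type widens from CallInst * to Value *. A minimal caller is shown below with invented names; note that the fold assumes a constant index, since the new code casts Idx to ConstantInt before checking isZero():

// Hedged sketch; extractLowOrSelf is an invented helper, not LLVM API.
#include "llvm/IR/IRBuilder.h"

static llvm::Value *extractLowOrSelf(llvm::IRBuilderBase &B, llvm::Value *Vec,
                                     llvm::Type *DstTy) {
  // When DstTy == Vec->getType(), the builder now returns Vec unchanged;
  // for a genuine subvector extract it still creates the intrinsic call.
  return B.CreateExtractVector(DstTy, Vec, B.getInt64(0));
}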
diff --git a/llvm/include/llvm/IR/Intrinsics.h b/llvm/include/llvm/IR/Intrinsics.h --- a/llvm/include/llvm/IR/Intrinsics.h +++ b/llvm/include/llvm/IR/Intrinsics.h @@ -138,6 +138,7 @@ AMX, PPCQuad, AnyPtrToElt, + AArch64Svcount, } Kind; union { diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -316,6 +316,7 @@ def IIT_ANYPTR_TO_ELT : IIT_Base<56>; def IIT_I2 : IIT_Int<2, 57>; def IIT_I4 : IIT_Int<4, 58>; +def IIT_AARCH64_SVCOUNT : IIT_VT; } defvar IIT_all_FixedTypes = !filter(iit, IIT_all, @@ -511,6 +512,8 @@ def llvm_x86mmx_ty : LLVMType; def llvm_ptrx86mmx_ty : LLVMPointerType; // <1 x i64>* +def llvm_aarch64_svcount_ty : LLVMType; + def llvm_x86amx_ty : LLVMType; def llvm_v2i1_ty : LLVMType; // 2 x i1 diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td --- a/llvm/include/llvm/IR/IntrinsicsAArch64.td +++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -2105,12 +2105,12 @@ // Reinterpreting data // -def int_aarch64_sve_convert_from_svbool : DefaultAttrsIntrinsic<[llvm_anyvector_ty], +def int_aarch64_sve_convert_from_svbool : DefaultAttrsIntrinsic<[llvm_any_ty], [llvm_nxv16i1_ty], [IntrNoMem]>; def int_aarch64_sve_convert_to_svbool : DefaultAttrsIntrinsic<[llvm_nxv16i1_ty], - [llvm_anyvector_ty], + [llvm_any_ty], [IntrNoMem]>; // @@ -2598,6 +2598,46 @@ def int_aarch64_sve_bfdot_lane_v2 : SVE_4Vec_BF16_Indexed; def int_aarch64_sve_bfmlalb_lane_v2 : SVE_4Vec_BF16_Indexed; def int_aarch64_sve_bfmlalt_lane_v2 : SVE_4Vec_BF16_Indexed; + +// +// SVE2.1 - Contiguous loads to multiple consecutive vectors +// + + class SVE2p1_Load_PN_VG2_Intrinsic + : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>], + [llvm_aarch64_svcount_ty, llvm_ptr_ty], + [IntrReadMem, IntrArgMemOnly]>; + + class SVE2p1_Load_PN_VG4_Intrinsic + : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, + LLVMMatchType<0>, LLVMMatchType<0>], + [llvm_aarch64_svcount_ty, llvm_ptr_ty], + [IntrReadMem, IntrArgMemOnly]>; + +def int_aarch64_sve_ld1_pn_vg2 : SVE2p1_Load_PN_VG2_Intrinsic; +def int_aarch64_sve_ld1_pn_vg4 : SVE2p1_Load_PN_VG4_Intrinsic; +def int_aarch64_sve_ldnt1_pn_vg2 : SVE2p1_Load_PN_VG2_Intrinsic; +def int_aarch64_sve_ldnt1_pn_vg4 : SVE2p1_Load_PN_VG4_Intrinsic; + +// +// SVE2.1 - Contiguous stores to multiple consecutive vectors +// + + class SVE2p1_Store_PN_VG2_Intrinsic + : DefaultAttrsIntrinsic<[], [ llvm_anyvector_ty, LLVMMatchType<0>, + llvm_aarch64_svcount_ty, llvm_ptr_ty ], + [IntrWriteMem, IntrArgMemOnly]>; + + class SVE2p1_Store_PN_VG4_Intrinsic + : DefaultAttrsIntrinsic<[], [ llvm_anyvector_ty, LLVMMatchType<0>, + LLVMMatchType<0>, LLVMMatchType<0>, + llvm_aarch64_svcount_ty, llvm_ptr_ty], + [IntrWriteMem, IntrArgMemOnly]>; + +def int_aarch64_sve_st1_pn_vg2 : SVE2p1_Store_PN_VG2_Intrinsic; +def int_aarch64_sve_st1_pn_vg4 : SVE2p1_Store_PN_VG4_Intrinsic; +def int_aarch64_sve_stnt1_pn_vg2 : SVE2p1_Store_PN_VG2_Intrinsic; +def int_aarch64_sve_stnt1_pn_vg4 : SVE2p1_Store_PN_VG4_Intrinsic; } // @@ -2644,10 +2684,11 @@ def int_aarch64_sme_st1q_vert : SME_Load_Store_Intrinsic; // Spill + fill - def int_aarch64_sme_ldr : DefaultAttrsIntrinsic< - [], [llvm_i32_ty, llvm_ptr_ty]>; - def int_aarch64_sme_str : DefaultAttrsIntrinsic< - [], [llvm_i32_ty, llvm_ptr_ty]>; + class SME_LDR_STR_Intrinsic + : DefaultAttrsIntrinsic<[], [llvm_i32_ty, llvm_ptr_ty]>; + + def int_aarch64_sme_ldr : SME_LDR_STR_Intrinsic; + def int_aarch64_sme_str : 
SME_LDR_STR_Intrinsic; class SME_TileToVector_Intrinsic : DefaultAttrsIntrinsic<[llvm_anyvector_ty], @@ -2715,6 +2756,8 @@ def int_aarch64_sme_cntsw : AdvSIMD_SME_CNTSB_Intrinsic; def int_aarch64_sme_cntsd : AdvSIMD_SME_CNTSB_Intrinsic; + def int_aarch64_sme_get_live_za_slices : AdvSIMD_SME_CNTSB_Intrinsic; + // // PSTATE Functions // @@ -2726,6 +2769,10 @@ : DefaultAttrsIntrinsic<[], [llvm_i64_ty], [IntrNoMem, IntrHasSideEffects]>; + def int_aarch64_sme_invoke_resume_pstatesm + : DefaultAttrsIntrinsic<[], [llvm_i64_ty], [IntrHasSideEffects]>; + + // def int_aarch64_sme_za_enable : DefaultAttrsIntrinsic<[], [], [IntrNoMem, IntrHasSideEffects]>; def int_aarch64_sme_za_disable @@ -2750,9 +2797,9 @@ // def int_aarch64_sve_psel - : DefaultAttrsIntrinsic<[llvm_anyvector_ty], - [LLVMMatchType<0>, - LLVMMatchType<0>, llvm_i32_ty], + : DefaultAttrsIntrinsic<[llvm_nxv16i1_ty], + [llvm_nxv16i1_ty, + llvm_anyvector_ty, llvm_i32_ty], [IntrNoMem]>; // @@ -2764,6 +2811,52 @@ [llvm_i64_ty, llvm_i64_ty], [IntrNoMem]>; } + // + // Predicate-as-counter intrinsics + // + + def int_aarch64_sve_pext + : DefaultAttrsIntrinsic<[llvm_anyvector_ty], + [llvm_aarch64_svcount_ty, llvm_i32_ty], + [IntrNoMem, ImmArg>]>; + + def int_aarch64_sve_pext_x2 + : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>], + [llvm_aarch64_svcount_ty, llvm_i32_ty], + [IntrNoMem, ImmArg>]>; + + def int_aarch64_sve_ptrue_c8 + : DefaultAttrsIntrinsic<[llvm_aarch64_svcount_ty], [], [IntrNoMem]>; + def int_aarch64_sve_ptrue_c16 + : DefaultAttrsIntrinsic<[llvm_aarch64_svcount_ty], [], [IntrNoMem]>; + def int_aarch64_sve_ptrue_c32 + : DefaultAttrsIntrinsic<[llvm_aarch64_svcount_ty], [], [IntrNoMem]>; + def int_aarch64_sve_ptrue_c64 + : DefaultAttrsIntrinsic<[llvm_aarch64_svcount_ty], [], [IntrNoMem]>; + + def int_aarch64_sve_cntp_c8 + : DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_aarch64_svcount_ty, llvm_i32_ty], + [IntrNoMem, ImmArg>]>; + def int_aarch64_sve_cntp_c16 + : DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_aarch64_svcount_ty, llvm_i32_ty], + [IntrNoMem, ImmArg>]>; + def int_aarch64_sve_cntp_c32 + : DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_aarch64_svcount_ty, llvm_i32_ty], + [IntrNoMem, ImmArg>]>; + def int_aarch64_sve_cntp_c64 + : DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_aarch64_svcount_ty, llvm_i32_ty], + [IntrNoMem, ImmArg>]>; + + // While (predicate-as-counter) intrinsics + foreach cmp = ["ge", "gt", "hi", "hs", "le", "lo", "ls", "lt"] in { + foreach ty = ["c8", "c16", "c32", "c64"] in { + def int_aarch64_sve_while # cmp # _ # ty + : DefaultAttrsIntrinsic<[llvm_aarch64_svcount_ty], + [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty], + [IntrNoMem, ImmArg>]>; + } + } + // // SME2 Intrinsics // @@ -2878,6 +2971,21 @@ LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; + class SVE2_VG2_Sel_Intrinsic + : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>], + [llvm_aarch64_svcount_ty, LLVMMatchType<0>, + LLVMMatchType<0>, LLVMMatchType<0>, + LLVMMatchType<0>], [IntrNoMem]>; + + class SVE2_VG4_Sel_Intrinsic + : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, + LLVMMatchType<0>, LLVMMatchType<0>], + [llvm_aarch64_svcount_ty, LLVMMatchType<0>, + LLVMMatchType<0>, LLVMMatchType<0>, + LLVMMatchType<0>, LLVMMatchType<0>, + LLVMMatchType<0>, LLVMMatchType<0>, + LLVMMatchType<0>], [IntrNoMem]>; + class SME2_CVT_VG2_SINGLE_Intrinsic : DefaultAttrsIntrinsic<[LLVMSubdivide2VectorType<0>], [llvm_anyvector_ty, LLVMMatchType<0>], @@ -2915,6 +3023,16 @@ [llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], 
[IntrNoMem]>; + class SME2_BFMLS_Intrinsic + : DefaultAttrsIntrinsic<[llvm_nxv4f32_ty], + [llvm_nxv4f32_ty, llvm_nxv8bf16_ty, llvm_nxv8bf16_ty], + [IntrNoMem]>; + + class SME2_BFMLS_Lane_Intrinsic + : DefaultAttrsIntrinsic<[llvm_nxv4f32_ty], + [llvm_nxv4f32_ty, llvm_nxv8bf16_ty, llvm_nxv8bf16_ty, llvm_i32_ty], + [IntrNoMem, ImmArg>]>; + class SME2_ZA_ArrayVector_Read_VG2_Intrinsic : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>], [llvm_i32_ty], @@ -3118,6 +3236,11 @@ def int_aarch64_sme_usmla_za32_lane_vg4x2 : SME2_Matrix_ArrayVector_VG2_Multi_Index_Intrinsic; def int_aarch64_sme_usmla_za32_lane_vg4x4 : SME2_Matrix_ArrayVector_VG4_Multi_Index_Intrinsic; + def int_aarch64_sve_bfmlslb : SME2_BFMLS_Intrinsic; + def int_aarch64_sve_bfmlslb_lane : SME2_BFMLS_Lane_Intrinsic; + + def int_aarch64_sve_bfmlslt : SME2_BFMLS_Intrinsic; + def int_aarch64_sve_bfmlslt_lane : SME2_BFMLS_Lane_Intrinsic; // Multi-vector signed saturating doubling multiply high def int_aarch64_sve_sqdmulh_single_vgx2 : SME2_VG2_Multi_Single_Intrinsic; @@ -3137,7 +3260,7 @@ // Multi-vector min/max // - foreach ty = ["f", "s", "u"] in { + foreach ty = ["bf", "f", "s", "u"] in { foreach instr = ["max", "min"] in { def int_aarch64_sve_ # ty # instr # _single_x2 : SME2_VG2_Multi_Single_Intrinsic; def int_aarch64_sve_ # ty # instr # _single_x4 : SME2_VG4_Multi_Single_Intrinsic; @@ -3163,9 +3286,7 @@ // Multi-vector vertical dot-products // - def int_aarch64_sme_fvdot_lane_za32_vg1x2 : SME2_Matrix_ArrayVector_VG2_Multi_Index_Intrinsic; - - foreach ty = ["s", "u"] in { + foreach ty = ["f", "s", "u"] in { def int_aarch64_sme_ #ty # vdot_lane_za32_vg1x2 : SME2_Matrix_ArrayVector_VG2_Multi_Index_Intrinsic; def int_aarch64_sme_ #ty # vdot_lane_za32_vg1x4 : SME2_Matrix_ArrayVector_VG4_Multi_Index_Intrinsic; def int_aarch64_sme_ #ty # vdot_lane_za64_vg1x4 : SME2_Matrix_ArrayVector_VG4_Multi_Index_Intrinsic; @@ -3337,4 +3458,53 @@ def int_aarch64_sve_uunpk_x2 : SME2_VG2_Unpk_Intrinsic; def int_aarch64_sve_sunpk_x4 : SME2_VG4_Unpk_Intrinsic; def int_aarch64_sve_uunpk_x4 : SME2_VG4_Unpk_Intrinsic; + + // 2-way and 4-way vector selects + def int_aarch64_sve_sel_x2 : SVE2_VG2_Sel_Intrinsic; + def int_aarch64_sve_sel_x4 : SVE2_VG4_Sel_Intrinsic; + + // + // Spill and fill of ZT0 + // + + def int_aarch64_sme_ldr_zt : SME_LDR_STR_Intrinsic; + def int_aarch64_sme_str_zt : SME_LDR_STR_Intrinsic; + + // + // Zero ZT0 + // + + def int_aarch64_sme_zero_zt : DefaultAttrsIntrinsic<[], [llvm_i32_ty], [ImmArg>, IntrWriteMem]>; + + // + // Lookup table expand one register + // + def int_aarch64_sme_luti2_lane_zt + : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [llvm_i32_ty, LLVMMatchType<0>, llvm_i32_ty], + [ImmArg>, ImmArg>, IntrReadMem]>; + def int_aarch64_sme_luti4_lane_zt + : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [llvm_i32_ty, LLVMMatchType<0>, llvm_i32_ty], + [ImmArg>, ImmArg>, IntrReadMem]>; + + // + // Lookup table expand two registers + // + def int_aarch64_sme_luti2_lane_zt_x2 + : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>], [llvm_i32_ty, LLVMMatchType<0>, llvm_i32_ty], + [ImmArg>, ImmArg>, IntrReadMem]>; + def int_aarch64_sme_luti4_lane_zt_x2 + : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>], [llvm_i32_ty, LLVMMatchType<0>, llvm_i32_ty], + [ImmArg>, ImmArg>, IntrReadMem]>; + + // + // Lookup table expand four registers + // + def int_aarch64_sme_luti2_lane_zt_x4 + : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], + [llvm_i32_ty, LLVMMatchType<0>, 
llvm_i32_ty], + [ImmArg>, ImmArg>, IntrReadMem]>; + def int_aarch64_sme_luti4_lane_zt_x4 + : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], + [llvm_i32_ty, LLVMMatchType<0>, llvm_i32_ty], + [ImmArg>, ImmArg>, IntrReadMem]>; } diff --git a/llvm/lib/IR/Function.cpp b/llvm/lib/IR/Function.cpp --- a/llvm/lib/IR/Function.cpp +++ b/llvm/lib/IR/Function.cpp @@ -1099,6 +1099,9 @@ case IIT_I4: OutputTable.push_back(IITDescriptor::get(IITDescriptor::Integer, 4)); return; + case IIT_AARCH64_SVCOUNT: + OutputTable.push_back(IITDescriptor::get(IITDescriptor::AArch64Svcount, 0)); + return; case IIT_I8: OutputTable.push_back(IITDescriptor::get(IITDescriptor::Integer, 8)); return; @@ -1340,6 +1343,8 @@ case IITDescriptor::Double: return Type::getDoubleTy(Context); case IITDescriptor::Quad: return Type::getFP128Ty(Context); case IITDescriptor::PPCQuad: return Type::getPPC_FP128Ty(Context); + case IITDescriptor::AArch64Svcount: + return TargetExtType::get(Context, "aarch64.svcount"); case IITDescriptor::Integer: return IntegerType::get(Context, D.Integer_Width); @@ -1514,6 +1519,9 @@ case IITDescriptor::Quad: return !Ty->isFP128Ty(); case IITDescriptor::PPCQuad: return !Ty->isPPC_FP128Ty(); case IITDescriptor::Integer: return !Ty->isIntegerTy(D.Integer_Width); + case IITDescriptor::AArch64Svcount: + return !isa(Ty) || + cast(Ty)->getName() != "aarch64.svcount"; case IITDescriptor::Vector: { VectorType *VT = dyn_cast(Ty); return !VT || VT->getElementCount() != D.Vector_Width || diff --git a/llvm/lib/Target/AArch64/AArch64.h b/llvm/lib/Target/AArch64/AArch64.h --- a/llvm/lib/Target/AArch64/AArch64.h +++ b/llvm/lib/Target/AArch64/AArch64.h @@ -54,6 +54,7 @@ FunctionPass *createFalkorMarkStridedAccessesPass(); FunctionPass *createAArch64BranchTargetsPass(); FunctionPass *createAArch64MIPeepholeOptPass(); +FunctionPass *createSMEPeepholeOptPass(); FunctionPass *createAArch64CleanupLocalDynamicTLSPass(); @@ -90,6 +91,8 @@ void initializeAArch64LoadStoreOptPass(PassRegistry&); void initializeAArch64LowerHomogeneousPrologEpilogPass(PassRegistry &); void initializeAArch64MIPeepholeOptPass(PassRegistry &); +void initializeSMEPeepholeOptPass(PassRegistry &); +void initializeAArch64SIMDInstrOptPass(PassRegistry&); void initializeAArch64O0PreLegalizerCombinerPass(PassRegistry &); void initializeAArch64PostLegalizerCombinerPass(PassRegistry &); void initializeAArch64PostLegalizerLoweringPass(PassRegistry &); diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td --- a/llvm/lib/Target/AArch64/AArch64.td +++ b/llvm/lib/Target/AArch64/AArch64.td @@ -109,6 +109,9 @@ def FeatureFP16FML : SubtargetFeature<"fp16fml", "HasFP16FML", "true", "Enable FP16 FML instructions (FEAT_FHM)", [FeatureFullFP16]>; +def FeatureBF16 : SubtargetFeature<"bf16", "HasBF16", + "true", "Enable BFloat16 Extension (FEAT_BF16)" >; + def FeatureSPE : SubtargetFeature<"spe", "HasSPE", "true", "Enable Statistical Profiling extension (FEAT_SPE)">; @@ -166,7 +169,7 @@ "Enable bit permutation SVE2 instructions (FEAT_SVE_BitPerm)", [FeatureSVE2]>; def FeatureSVE2p1: SubtargetFeature<"sve2p1", "HasSVE2p1", "true", - "Enable Scalable Vector Extension 2.1 instructions", [FeatureSVE2]>; + "Enable Scalable Vector Extension 2.1 instructions", [FeatureSVE2, FeatureBF16]>; def FeatureB16B16 : SubtargetFeature<"b16b16", "HasB16B16", "true", "Enable SVE2.1 or SME2.1 non-widening BFloat16 to BFloat16 instructions (FEAT_B16B16)", []>; @@ -435,9 +438,6 @@ "true", "Use an instruction sequence 
for taking the address of a global "
             "that allows a memory tag in the upper address bits">;
 
-def FeatureBF16 : SubtargetFeature<"bf16", "HasBF16",
-    "true", "Enable BFloat16 Extension (FEAT_BF16)" >;
-
 def FeatureMatMulInt8 : SubtargetFeature<"i8mm", "HasMatMulInt8",
     "true", "Enable Matrix Multiply Int8 Extension (FEAT_I8MM)">;
 
diff --git a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
--- a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
+++ b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
@@ -968,10 +968,14 @@
 }
 
 void AArch64AsmPrinter::emitFunctionEntryLabel() {
+  SMEAttrs Attrs(MF->getFunction());
   if (MF->getFunction().getCallingConv() == CallingConv::AArch64_VectorCall ||
       MF->getFunction().getCallingConv() ==
           CallingConv::AArch64_SVE_VectorCall ||
-      MF->getInfo<AArch64FunctionInfo>()->isSVECC()) {
+      MF->getInfo<AArch64FunctionInfo>()->isSVECC() ||
+      Attrs.hasStreamingInterface() ||
+      Attrs.hasStreamingCompatibleInterface() ||
+      Attrs.hasSharedZAInterface()) {
     auto *TS =
         static_cast<AArch64TargetStreamer *>(OutStreamer->getTargetStreamer());
     TS->emitDirectiveVariantPCS(CurrentFnSym);
diff --git a/llvm/lib/Target/AArch64/AArch64FastISel.cpp b/llvm/lib/Target/AArch64/AArch64FastISel.cpp
--- a/llvm/lib/Target/AArch64/AArch64FastISel.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FastISel.cpp
@@ -5186,8 +5186,10 @@
 FastISel *AArch64::createFastISel(FunctionLoweringInfo &FuncInfo,
                                   const TargetLibraryInfo *LibInfo) {
-  SMEAttrs CallerAttrs(*FuncInfo.Fn);
-  if (CallerAttrs.hasZAState() ||
+  const AArch64Subtarget *Subtarget =
+      &FuncInfo.MF->getSubtarget<AArch64Subtarget>();
+  SMEAttrs CallerAttrs(*FuncInfo.Fn, Subtarget->hasSME2());
+  if (CallerAttrs.hasZAState() || CallerAttrs.hasZTState() ||
       (!CallerAttrs.hasStreamingInterface() &&
        CallerAttrs.hasStreamingBody()))
     return nullptr;
   return new AArch64FastISel(FuncInfo, LibInfo);
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -326,9 +326,13 @@
     return false;
   }
 
-  template <unsigned BaseReg> bool ImmToTile(SDValue N, SDValue &Imm) {
+  template <unsigned BaseReg, unsigned Max> bool ImmToTile(SDValue N, SDValue &Imm) {
     if (auto *CI = dyn_cast<ConstantSDNode>(N)) {
       uint64_t C = CI->getZExtValue();
+
+      if (C > Max)
+        return false;
+
       Imm = CurDAG->getRegister(BaseReg + C, MVT::Other);
       return true;
     }
@@ -370,8 +374,12 @@
   void SelectPredicatedLoad(SDNode *N, unsigned NumVecs, unsigned Scale,
                             unsigned Opc_rr, unsigned Opc_ri,
                             bool IsIntr = false);
+  void SelectContiguousMultiVectorLoad(SDNode *N, unsigned NumVecs,
+                                       unsigned Scale, unsigned Opc_ri);
   void SelectDestructiveMultiIntrinsic(SDNode *N, unsigned NumVecs,
-                                       bool IsZmMulti, unsigned Opcode);
+                                       bool IsZmMulti, unsigned Opcode,
+                                       bool HasPred = false);
+  void SelectPExtPair(SDNode *N, unsigned Opc);
   void SelectWhilePair(SDNode *N, unsigned Opc);
   void SelectCVTIntrinsic(SDNode *N, unsigned NumVecs, unsigned Opcode);
   void SelectClamp(SDNode *N, unsigned NumVecs, unsigned Opcode);
@@ -382,6 +390,8 @@
   template <unsigned MaxIdx, unsigned Scale>
   void SelectMultiVectorMove(SDNode *N, unsigned NumVecs, unsigned BaseReg,
                              unsigned Op);
+  template <unsigned Max>
+  void SelectMultiVectorLuti(SDNode *Node, unsigned NumOutVecs, unsigned Opc);
   bool SelectAddrModeFrameIndexSVE(SDValue N, SDValue &Base, SDValue &OffImm);
 
   /// SVE Reg+Imm addressing mode.
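As an illustrative sketch (not taken from the patch): the ImmToTile change above adds an upper bound so that an out-of-range tile index simply fails to match rather than forming an invalid register operand. A standalone sketch of the same bounds-checked pattern follows, with invented names:

// Hedged sketch of a bounds-checked immediate-to-register selection helper.
#include "llvm/CodeGen/SelectionDAG.h"

static bool selectTileRegFromImm(llvm::SelectionDAG &DAG, llvm::SDValue N,
                                 unsigned BaseReg, unsigned Max,
                                 llvm::SDValue &Imm) {
  auto *CI = llvm::dyn_cast<llvm::ConstantSDNode>(N);
  if (!CI)
    return false;                 // not a constant index, no match
  uint64_t C = CI->getZExtValue();
  if (C > Max)
    return false;                 // index past the last available register
  Imm = DAG.getRegister(BaseReg + C, llvm::MVT::Other);
  return true;
}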
@@ -1597,10 +1607,11 @@ } enum class SelectTypeKind { - Int1 = 0, + AnyType = 0, Int = 1, FP = 2, - AnyType = 3, + Int1 = 3, + BF16 = 4 }; /// This function selects an opcode from a list of opcodes, which is @@ -1629,6 +1640,10 @@ if (EltVT != MVT::f16 && EltVT != MVT::f32 && EltVT != MVT::f64) return 0; break; + case SelectTypeKind::BF16: + if (EltVT != MVT::bf16) + return 0; + break; } unsigned Offset; @@ -1652,6 +1667,28 @@ return (Opcodes.size() <= Offset) ? 0 : Opcodes[Offset]; } +// This function is almost identical to SelectWhilePair, but has an +// extra check on the range of the immediate operand. +// TODO: Merge these two functions together at some point? +void AArch64DAGToDAGISel::SelectPExtPair(SDNode *N, unsigned Opc) { + // Immediate can be either 0 or 1. + if (ConstantSDNode *Imm = dyn_cast(N->getOperand(2))) + if (Imm->getZExtValue() > 1) + return; + + SDLoc DL(N); + EVT VT = N->getValueType(0); + SDValue Ops[] = {N->getOperand(1), N->getOperand(2)}; + SDNode *WhilePair = CurDAG->getMachineNode(Opc, DL, MVT::Untyped, Ops); + SDValue SuperReg = SDValue(WhilePair, 0); + + for (unsigned I = 0; I < 2; ++I) + ReplaceUses(SDValue(N, I), CurDAG->getTargetExtractSubreg( + AArch64::psub0 + I, DL, VT, SuperReg)); + + CurDAG->RemoveDeadNode(N); +} + void AArch64DAGToDAGISel::SelectWhilePair(SDNode *N, unsigned Opc) { SDLoc DL(N); EVT VT = N->getValueType(0); @@ -1686,11 +1723,13 @@ void AArch64DAGToDAGISel::SelectDestructiveMultiIntrinsic(SDNode *N, unsigned NumVecs, bool IsZmMulti, - unsigned Opcode) { + unsigned Opcode, + bool HasPred) { assert(Opcode != 0 && "Unexpected opcode"); SDLoc DL(N); EVT VT = N->getValueType(0); + unsigned FirstVecIdx = HasPred ? 2 : 1; auto GetMultiVecOperand = [=](unsigned StartIdx) { SmallVector Regs(N->op_begin() + StartIdx, @@ -1698,16 +1737,20 @@ return createZMulTuple(Regs); }; - SDValue Zdn = GetMultiVecOperand(1); + SDValue Zdn = GetMultiVecOperand(FirstVecIdx); SDValue Zm; if (IsZmMulti) - Zm = GetMultiVecOperand(NumVecs + 1); + Zm = GetMultiVecOperand(NumVecs + FirstVecIdx); else - Zm = N->getOperand(NumVecs + 1); - - SDNode *Intrinsic = CurDAG->getMachineNode(Opcode, DL, MVT::Untyped, Zdn, Zm); + Zm = N->getOperand(NumVecs + FirstVecIdx); + SDNode *Intrinsic; + if (HasPred) + Intrinsic = CurDAG->getMachineNode(Opcode, DL, MVT::Untyped, + N->getOperand(1), Zdn, Zm); + else + Intrinsic = CurDAG->getMachineNode(Opcode, DL, MVT::Untyped, Zdn, Zm); SDValue SuperReg = SDValue(Intrinsic, 0); for (unsigned i = 0; i < NumVecs; ++i) ReplaceUses(SDValue(N, i), CurDAG->getTargetExtractSubreg( @@ -1749,6 +1792,38 @@ CurDAG->RemoveDeadNode(N); } +void AArch64DAGToDAGISel::SelectContiguousMultiVectorLoad(SDNode *N, + unsigned NumVecs, + unsigned Scale, + unsigned Opc_ri) { + assert(Scale < 4 && "Invalid scaling value."); + SDLoc DL(N); + EVT VT = N->getValueType(0); + SDValue Chain = N->getOperand(0); + + // Use simplest addressing mode for now - base + 0 offset + SDValue PNg = N->getOperand(2); + SDValue Base = N->getOperand(3); + SDValue Offset = CurDAG->getTargetConstant(0, DL, MVT::i64); + + SDValue Ops[] = {PNg, // Predicate-as-counter + Base, // Memory operand + Offset, Chain}; + + const EVT ResTys[] = {MVT::Untyped, MVT::Other}; + + SDNode *Load = CurDAG->getMachineNode(Opc_ri, DL, ResTys, Ops); + SDValue SuperReg = SDValue(Load, 0); + for (unsigned i = 0; i < NumVecs; ++i) + ReplaceUses(SDValue(N, i), CurDAG->getTargetExtractSubreg( + AArch64::zsub0 + i, DL, VT, SuperReg)); + + // Copy chain + unsigned ChainIdx = NumVecs; + ReplaceUses(SDValue(N, 
ChainIdx), SDValue(Load, 1)); + CurDAG->RemoveDeadNode(N); +} + void AArch64DAGToDAGISel::SelectFrintFromVT(SDNode *N, unsigned NumVecs, unsigned Opcode) { if (N->getValueType(0) != MVT::nxv4f32) @@ -1756,6 +1831,34 @@ SelectUnaryMultiIntrinsic(N, NumVecs, true, Opcode); } +template +void AArch64DAGToDAGISel::SelectMultiVectorLuti(SDNode *Node, + unsigned NumOutVecs, + unsigned Opc) { + if (ConstantSDNode *Imm = dyn_cast(Node->getOperand(4))) + if (Imm->getZExtValue() > Max) + return; + + SDValue ZtValue; + ImmToTile(Node->getOperand(2), ZtValue); + SDValue Ops[] = {ZtValue, Node->getOperand(3), Node->getOperand(4)}; + SDLoc DL(Node); + EVT VT = Node->getValueType(0); + + SDNode *Instruction = + CurDAG->getMachineNode(Opc, DL, {MVT::Untyped, MVT::Other}, Ops); + SDValue SuperReg = SDValue(Instruction, 0); + + for (unsigned i = 0; i < NumOutVecs; ++i) + ReplaceUses(SDValue(Node, i), CurDAG->getTargetExtractSubreg( + AArch64::zsub0 + i, DL, VT, SuperReg)); + + // Copy chain + unsigned ChainIdx = NumOutVecs; + ReplaceUses(SDValue(Node, ChainIdx), SDValue(Instruction, 1)); + CurDAG->RemoveDeadNode(Node); +} + void AArch64DAGToDAGISel::SelectClamp(SDNode *N, unsigned NumVecs, unsigned Op) { SDLoc DL(N); @@ -4625,6 +4728,74 @@ } break; } + case Intrinsic::aarch64_sve_ld1_pn_vg2: { + if (VT == MVT::nxv16i8) { + SelectContiguousMultiVectorLoad(Node, 2, 0, AArch64::LD1B_2Z_IMM); + return; + } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 || + VT == MVT::nxv8bf16) { + SelectContiguousMultiVectorLoad(Node, 2, 1, AArch64::LD1H_2Z_IMM); + return; + } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) { + SelectContiguousMultiVectorLoad(Node, 2, 2, AArch64::LD1W_2Z_IMM); + return; + } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) { + SelectContiguousMultiVectorLoad(Node, 2, 3, AArch64::LD1D_2Z_IMM); + return; + } + break; + } + case Intrinsic::aarch64_sve_ld1_pn_vg4: { + if (VT == MVT::nxv16i8) { + SelectContiguousMultiVectorLoad(Node, 4, 0, AArch64::LD1B_4Z_IMM); + return; + } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 || + VT == MVT::nxv8bf16) { + SelectContiguousMultiVectorLoad(Node, 4, 1, AArch64::LD1H_4Z_IMM); + return; + } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) { + SelectContiguousMultiVectorLoad(Node, 4, 2, AArch64::LD1W_4Z_IMM); + return; + } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) { + SelectContiguousMultiVectorLoad(Node, 4, 3, AArch64::LD1D_4Z_IMM); + return; + } + break; + } + case Intrinsic::aarch64_sve_ldnt1_pn_vg2: { + if (VT == MVT::nxv16i8) { + SelectContiguousMultiVectorLoad(Node, 2, 0, AArch64::LDNT1B_2Z_IMM); + return; + } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 || + VT == MVT::nxv8bf16) { + SelectContiguousMultiVectorLoad(Node, 2, 1, AArch64::LDNT1H_2Z_IMM); + return; + } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) { + SelectContiguousMultiVectorLoad(Node, 2, 2, AArch64::LDNT1W_2Z_IMM); + return; + } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) { + SelectContiguousMultiVectorLoad(Node, 2, 3, AArch64::LDNT1D_2Z_IMM); + return; + } + break; + } + case Intrinsic::aarch64_sve_ldnt1_pn_vg4: { + if (VT == MVT::nxv16i8) { + SelectContiguousMultiVectorLoad(Node, 4, 0, AArch64::LDNT1B_4Z_IMM); + return; + } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 || + VT == MVT::nxv8bf16) { + SelectContiguousMultiVectorLoad(Node, 4, 1, AArch64::LDNT1H_4Z_IMM); + return; + } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) { + SelectContiguousMultiVectorLoad(Node, 4, 2, AArch64::LDNT1W_4Z_IMM); + return; + } else if (VT == MVT::nxv2i64 
|| VT == MVT::nxv2f64) { + SelectContiguousMultiVectorLoad(Node, 4, 3, AArch64::LDNT1D_4Z_IMM); + return; + } + break; + } case Intrinsic::aarch64_sve_ld3_sret: { if (VT == MVT::nxv16i8) { SelectPredicatedLoad(Node, 3, 0, AArch64::LD3B_IMM, AArch64::LD3B, @@ -4779,6 +4950,41 @@ MF.getInfo()->setHasSwiftAsyncContext(true); return; } + case Intrinsic::aarch64_sme_luti2_lane_zt_x2: { + if (auto Opc = SelectOpcodeFromVT( + Node->getValueType(0), + {AArch64::LUTI2_2ZTZI_B, AArch64::LUTI2_2ZTZI_H, + AArch64::LUTI2_2ZTZI_S})) + // Second Immediate must be <= 7: + SelectMultiVectorLuti<7>(Node, 2, Opc); + return; + } + case Intrinsic::aarch64_sme_luti4_lane_zt_x2: { + if (auto Opc = SelectOpcodeFromVT( + Node->getValueType(0), + {AArch64::LUTI4_2ZTZI_B, AArch64::LUTI4_2ZTZI_H, + AArch64::LUTI4_2ZTZI_S})) + // Second Immediate must be <= 3: + SelectMultiVectorLuti<3>(Node, 2, Opc); + return; + } + case Intrinsic::aarch64_sme_luti2_lane_zt_x4: { + if (auto Opc = SelectOpcodeFromVT( + Node->getValueType(0), + {AArch64::LUTI2_4ZTZI_B, AArch64::LUTI2_4ZTZI_H, + AArch64::LUTI2_4ZTZI_S})) + // Second Immediate must be <= 3: + SelectMultiVectorLuti<3>(Node, 4, Opc); + return; + } + case Intrinsic::aarch64_sme_luti4_lane_zt_x4: { + if (auto Opc = SelectOpcodeFromVT( + Node->getValueType(0), + {0, AArch64::LUTI4_4ZTZI_H, AArch64::LUTI4_4ZTZI_S})) + // Second Immediate must be <= 1: + SelectMultiVectorLuti<1>(Node, 4, Opc); + return; + } } } break; case ISD::INTRINSIC_WO_CHAIN: { @@ -4973,6 +5179,11 @@ AArch64::UMAX_VG2_2ZZ_S, AArch64::UMAX_VG2_2ZZ_D})) SelectDestructiveMultiIntrinsic(Node, 2, false, Op); return; + case Intrinsic::aarch64_sve_bfmax_single_x2: + if (auto Op = SelectOpcodeFromVT( + Node->getValueType(0), {0, AArch64::BFMAX_VG2_2ZZ_H, 0, 0})) + SelectDestructiveMultiIntrinsic(Node, 2, false, Op); + return; case Intrinsic::aarch64_sve_fmax_single_x2: if (auto Op = SelectOpcodeFromVT( Node->getValueType(0), @@ -4994,6 +5205,11 @@ AArch64::UMAX_VG4_4ZZ_S, AArch64::UMAX_VG4_4ZZ_D})) SelectDestructiveMultiIntrinsic(Node, 4, false, Op); return; + case Intrinsic::aarch64_sve_bfmax_single_x4: + if (auto Op = SelectOpcodeFromVT( + Node->getValueType(0), {0, AArch64::BFMAX_VG4_4ZZ_H, 0, 0})) + SelectDestructiveMultiIntrinsic(Node, 4, false, Op); + return; case Intrinsic::aarch64_sve_fmax_single_x4: if (auto Op = SelectOpcodeFromVT( Node->getValueType(0), @@ -5015,6 +5231,11 @@ AArch64::UMIN_VG2_2ZZ_S, AArch64::UMIN_VG2_2ZZ_D})) SelectDestructiveMultiIntrinsic(Node, 2, false, Op); return; + case Intrinsic::aarch64_sve_bfmin_single_x2: + if (auto Op = SelectOpcodeFromVT( + Node->getValueType(0), {0, AArch64::BFMIN_VG2_2ZZ_H, 0, 0})) + SelectDestructiveMultiIntrinsic(Node, 2, false, Op); + return; case Intrinsic::aarch64_sve_fmin_single_x2: if (auto Op = SelectOpcodeFromVT( Node->getValueType(0), @@ -5036,6 +5257,11 @@ AArch64::UMIN_VG4_4ZZ_S, AArch64::UMIN_VG4_4ZZ_D})) SelectDestructiveMultiIntrinsic(Node, 4, false, Op); return; + case Intrinsic::aarch64_sve_bfmin_single_x4: + if (auto Op = SelectOpcodeFromVT( + Node->getValueType(0), {0, AArch64::BFMIN_VG4_4ZZ_H, 0, 0})) + SelectDestructiveMultiIntrinsic(Node, 4, false, Op); + return; case Intrinsic::aarch64_sve_fmin_single_x4: if (auto Op = SelectOpcodeFromVT( Node->getValueType(0), @@ -5057,6 +5283,11 @@ AArch64::UMAX_VG2_2Z2Z_S, AArch64::UMAX_VG2_2Z2Z_D})) SelectDestructiveMultiIntrinsic(Node, 2, true, Op); return; + case Intrinsic::aarch64_sve_bfmax_x2: + if (auto Op = SelectOpcodeFromVT( + Node->getValueType(0), {0, AArch64::BFMAX_VG2_2Z2Z_H, 0, 0})) 
+ SelectDestructiveMultiIntrinsic(Node, 2, true, Op); + return; case Intrinsic::aarch64_sve_fmax_x2: if (auto Op = SelectOpcodeFromVT( Node->getValueType(0), @@ -5078,6 +5309,11 @@ AArch64::UMAX_VG4_4Z4Z_S, AArch64::UMAX_VG4_4Z4Z_D})) SelectDestructiveMultiIntrinsic(Node, 4, true, Op); return; + case Intrinsic::aarch64_sve_bfmax_x4: + if (auto Op = SelectOpcodeFromVT( + Node->getValueType(0), {0, AArch64::BFMAX_VG4_4Z2Z_H, 0, 0})) + SelectDestructiveMultiIntrinsic(Node, 4, true, Op); + return; case Intrinsic::aarch64_sve_fmax_x4: if (auto Op = SelectOpcodeFromVT( Node->getValueType(0), @@ -5099,6 +5335,11 @@ AArch64::UMIN_VG2_2Z2Z_S, AArch64::UMIN_VG2_2Z2Z_D})) SelectDestructiveMultiIntrinsic(Node, 2, true, Op); return; + case Intrinsic::aarch64_sve_bfmin_x2: + if (auto Op = SelectOpcodeFromVT( + Node->getValueType(0), {0, AArch64::BFMIN_VG2_2Z2Z_H, 0, 0})) + SelectDestructiveMultiIntrinsic(Node, 2, true, Op); + return; case Intrinsic::aarch64_sve_fmin_x2: if (auto Op = SelectOpcodeFromVT( Node->getValueType(0), @@ -5120,6 +5361,11 @@ AArch64::UMIN_VG4_4Z4Z_S, AArch64::UMIN_VG4_4Z4Z_D})) SelectDestructiveMultiIntrinsic(Node, 4, true, Op); return; + case Intrinsic::aarch64_sve_bfmin_x4: + if (auto Op = SelectOpcodeFromVT( + Node->getValueType(0), {0, AArch64::BFMIN_VG4_4Z2Z_H, 0, 0})) + SelectDestructiveMultiIntrinsic(Node, 4, true, Op); + return; case Intrinsic::aarch64_sve_fmin_x4: if (auto Op = SelectOpcodeFromVT( Node->getValueType(0), @@ -5307,6 +5553,20 @@ SelectUnaryMultiIntrinsic(Node, 4, /*IsTupleInput=*/true, AArch64::UZP_VG4_4Z4Z_Q); return; + case Intrinsic::aarch64_sve_sel_x2: + if (auto Op = SelectOpcodeFromVT( + Node->getValueType(0), + {AArch64::SEL_VG2_2ZC2Z2Z_B, AArch64::SEL_VG2_2ZC2Z2Z_H, + AArch64::SEL_VG2_2ZC2Z2Z_S, AArch64::SEL_VG2_2ZC2Z2Z_D})) + SelectDestructiveMultiIntrinsic(Node, 2, true, Op, /*HasPred=*/true); + return; + case Intrinsic::aarch64_sve_sel_x4: + if (auto Op = SelectOpcodeFromVT( + Node->getValueType(0), + {AArch64::SEL_VG4_4ZC4Z4Z_B, AArch64::SEL_VG4_4ZC4Z4Z_H, + AArch64::SEL_VG4_4ZC4Z4Z_S, AArch64::SEL_VG4_4ZC4Z4Z_D})) + SelectDestructiveMultiIntrinsic(Node, 4, true, Op, /*HasPred=*/true); + return; case Intrinsic::aarch64_sve_frinta_x2: SelectFrintFromVT(Node, 2, AArch64::FRINTA_2Z2Z_S); return; @@ -5359,6 +5619,14 @@ AArch64::UUNPK_VG4_4Z2Z_D})) SelectUnaryMultiIntrinsic(Node, 4, /*IsTupleInput=*/true, Op); return; + case Intrinsic::aarch64_sve_pext_x2: { + if (auto Op = SelectOpcodeFromVT( + Node->getValueType(0), + {AArch64::PEXT_2PCI_B, AArch64::PEXT_2PCI_H, AArch64::PEXT_2PCI_S, + AArch64::PEXT_2PCI_D})) + SelectPExtPair(Node, Op); + return; + } } break; } diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -68,6 +68,8 @@ SMSTART, SMSTOP, RESTORE_ZA, + RESTORE_ZT, + SAVE_ZT, // Produces the full sequence of instructions for getting the thread pointer // offset of a variable into X0, using the TLSDesc model. 
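// To show how these two new nodes are used: a simplified sketch of the ZT0
// preservation logic from the LowerCall changes later in this patch (the
// names match the patch; PtrVT stands in for the frame-index type and the
// surrounding call-lowering plumbing is elided):

bool PreserveZT = CallerAttrs.requiresPreservingZT(CalleeAttrs);
SDValue ZTFrameIdx;
if (PreserveZT) {
  // ZT0 is 512 bits wide, so a 64-byte, 16-byte-aligned spill slot suffices.
  unsigned ZTObj = MFI.CreateSpillStackObject(64, Align(16));
  ZTFrameIdx = DAG.getFrameIndex(ZTObj, PtrVT);
  // Spill ZT0 to the slot before the call...
  Chain = DAG.getNode(AArch64ISD::SAVE_ZT, DL, DAG.getVTList(MVT::Other),
                      {Chain, DAG.getConstant(0, DL, MVT::i32), ZTFrameIdx});
}
// ... the call itself is lowered here ...
if (PreserveZT)
  // ...and refill it from the slot afterwards.
  Result = DAG.getNode(AArch64ISD::RESTORE_ZT, DL, DAG.getVTList(MVT::Other),
                       {Result, DAG.getConstant(0, DL, MVT::i32), ZTFrameIdx});

// The same nodes are produced directly for the aarch64.sme.ldr.zt and
// aarch64.sme.str.zt intrinsics, and are expanded to the LDR_TX/STR_TX
// pseudos (and finally the real instructions) by EmitZTInstr.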
@@ -606,10 +608,21 @@ MachineBasicBlock *EmitTileLoad(unsigned Opc, unsigned BaseReg, MachineInstr &MI, MachineBasicBlock *BB) const; - MachineBasicBlock *EmitFill(MachineInstr &MI, MachineBasicBlock *BB) const; + MachineBasicBlock *EmitMopa(unsigned Opc, unsigned BaseReg, MachineInstr &MI, + MachineBasicBlock *BB) const; + MachineBasicBlock *EmitZASpillFill(MachineInstr &MI, MachineBasicBlock *BB, + bool IsSpill) const; + MachineBasicBlock *EmitZTInstr(MachineInstr &MI, MachineBasicBlock *BB, + unsigned Opcode, bool IsZTDest) const; MachineBasicBlock *EmitZAInstr(unsigned Opc, unsigned BaseReg, MachineInstr &MI, MachineBasicBlock *BB, bool HasTile) const; + MachineBasicBlock *EmitAddVectorToTile(unsigned Opc, unsigned BaseReg, + MachineInstr &MI, + MachineBasicBlock *BB) const; + MachineBasicBlock *EmitInsertVectorToTile(unsigned Opc, unsigned BaseReg, + MachineInstr &MI, + MachineBasicBlock *BB) const; MachineBasicBlock *EmitZero(MachineInstr &MI, MachineBasicBlock *BB) const; MachineBasicBlock * diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -2255,6 +2255,8 @@ MAKE_CASE(AArch64ISD::SMSTART) MAKE_CASE(AArch64ISD::SMSTOP) MAKE_CASE(AArch64ISD::RESTORE_ZA) + MAKE_CASE(AArch64ISD::RESTORE_ZT) + MAKE_CASE(AArch64ISD::SAVE_ZT) MAKE_CASE(AArch64ISD::CALL) MAKE_CASE(AArch64ISD::ADRP) MAKE_CASE(AArch64ISD::ADR) @@ -2644,13 +2646,18 @@ return BB; } -MachineBasicBlock * -AArch64TargetLowering::EmitFill(MachineInstr &MI, MachineBasicBlock *BB) const { +MachineBasicBlock *AArch64TargetLowering::EmitZASpillFill(MachineInstr &MI, + MachineBasicBlock *BB, + bool IsSpill) const { const TargetInstrInfo *TII = Subtarget->getInstrInfo(); MachineInstrBuilder MIB = - BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::LDR_ZA)); + BuildMI(*BB, MI, MI.getDebugLoc(), + TII->get(IsSpill ? AArch64::STR_ZA : AArch64::LDR_ZA)); - MIB.addReg(AArch64::ZA, RegState::Define); + if (IsSpill) + MIB.addReg(AArch64::ZA); + else + MIB.addReg(AArch64::ZA, RegState::Define); MIB.add(MI.getOperand(0)); // Vector select register MIB.add(MI.getOperand(1)); // Vector select offset MIB.add(MI.getOperand(2)); // Base @@ -2660,6 +2667,63 @@ return BB; } +MachineBasicBlock * +AArch64TargetLowering::EmitMopa(unsigned Opc, unsigned BaseReg, + MachineInstr &MI, MachineBasicBlock *BB) const { + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc)); + + MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define); + MIB.addReg(BaseReg + MI.getOperand(0).getImm()); + MIB.add(MI.getOperand(1)); // pn + MIB.add(MI.getOperand(2)); // pm + MIB.add(MI.getOperand(3)); // zn + MIB.add(MI.getOperand(4)); // zm + + MI.eraseFromParent(); // The pseudo is gone now. 
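// For illustration (not part of the expansion above): with BaseReg ==
// AArch64::ZAS0 and a tile immediate of 2, a 32-bit MOPA pseudo would become
// something like
//   ZAS2 = FMOPA_MPPZZ_S ZAS2, $pn, $pm, $zn, $zm
// i.e. the selected ZA tile is both defined and used, because MOPA/MOPS
// accumulate into the tile rather than overwrite it.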
+ return BB; +} + +MachineBasicBlock * +AArch64TargetLowering::EmitInsertVectorToTile(unsigned Opc, unsigned BaseReg, + MachineInstr &MI, + MachineBasicBlock *BB) const { + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc)); + + MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define); + MIB.addReg(BaseReg + MI.getOperand(0).getImm()); + MIB.add(MI.getOperand(1)); // Slice index register + MIB.add(MI.getOperand(2)); // Slice index offset + MIB.add(MI.getOperand(3)); // pg + MIB.add(MI.getOperand(4)); // zn + + MI.eraseFromParent(); // The pseudo is gone now. + return BB; +} + +MachineBasicBlock *AArch64TargetLowering::EmitZTInstr(MachineInstr &MI, + MachineBasicBlock *BB, + unsigned Opcode, + bool IsZTDest) const { + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + MachineInstrBuilder MIB; + + if (IsZTDest) + MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opcode), + MI.getOperand(0).getReg()); + else { + MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opcode)); + MIB.addReg(MI.getOperand(0).getReg()); + } + + for (unsigned I = 1; I < MI.getNumOperands(); ++I) + MIB.add(MI.getOperand(I)); + + MI.eraseFromParent(); // The pseudo is gone now. + return BB; +} + MachineBasicBlock * AArch64TargetLowering::EmitZAInstr(unsigned Opc, unsigned BaseReg, MachineInstr &MI, @@ -2771,9 +2835,17 @@ case AArch64::LD1_MXIPXX_V_PSEUDO_Q: return EmitTileLoad(AArch64::LD1_MXIPXX_V_Q, AArch64::ZAQ0, MI, BB); case AArch64::LDR_ZA_PSEUDO: - return EmitFill(MI, BB); + return EmitZASpillFill(MI, BB, /*IsSpill=*/false); + case AArch64::STR_ZA_PSEUDO: + return EmitZASpillFill(MI, BB, /*IsSpill=*/true); + case AArch64::LDR_TX_PSEUDO: + return EmitZTInstr(MI, BB, AArch64::LDR_TX, /*IsZTDest=*/true); + case AArch64::STR_TX_PSEUDO: + return EmitZTInstr(MI, BB, AArch64::STR_TX, /*IsZTDest=*/false); case AArch64::ZERO_M_PSEUDO: return EmitZero(MI, BB); + case AArch64::ZERO_T_PSEUDO: + return EmitZTInstr(MI, BB, AArch64::ZERO_T, /*IsZTDest=*/true); } } @@ -5008,6 +5080,7 @@ case Intrinsic::aarch64_sve_clz: return DAG.getNode(AArch64ISD::CTLZ_MERGE_PASSTHRU, dl, Op.getValueType(), Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); + case Intrinsic::aarch64_sme_get_live_za_slices: case Intrinsic::aarch64_sme_cntsb: return DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(), DAG.getConstant(1, dl, MVT::i32)); @@ -5039,8 +5112,12 @@ case Intrinsic::aarch64_sve_dupq_lane: return LowerDUPQLane(Op, DAG); case Intrinsic::aarch64_sve_convert_from_svbool: + if (Op.getValueType() == MVT::aarch64svcount) + return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Op.getOperand(1)); return getSVEPredicateBitCast(Op.getValueType(), Op.getOperand(1), DAG); case Intrinsic::aarch64_sve_convert_to_svbool: + if (Op.getOperand(1).getValueType() == MVT::aarch64svcount) + return DAG.getNode(ISD::BITCAST, dl, MVT::nxv16i1, Op.getOperand(1)); return getSVEPredicateBitCast(MVT::nxv16i1, Op.getOperand(1), DAG); case Intrinsic::aarch64_sve_fneg: return DAG.getNode(AArch64ISD::FNEG_MERGE_PASSTHRU, dl, Op.getValueType(), @@ -6232,6 +6309,17 @@ DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout())); Chain = DAG.getStore(Chain, DL, Buffer, Ptr, MPI); + // Set the reserved bytes (10-15) to zero + SDValue ReservedPtr = + DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, + DAG.getConstant(10, DL, Ptr.getValueType())); + Chain = DAG.getStore(Chain, DL, DAG.getConstant(0, DL, MVT::i16), + ReservedPtr, MPI); + ReservedPtr = 
DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, + DAG.getConstant(12, DL, Ptr.getValueType())); + Chain = DAG.getStore(Chain, DL, DAG.getConstant(0, DL, MVT::i32), + ReservedPtr, MPI); + return TPIDR2Obj; } @@ -6291,7 +6379,7 @@ (void)Res; } - SMEAttrs Attrs(MF.getFunction()); + SMEAttrs Attrs(MF.getFunction(), Subtarget->hasSME2()); bool IsLocallyStreaming = !Attrs.hasStreamingInterface() && Attrs.hasStreamingBody(); assert(Chain.getOpcode() == ISD::EntryToken && "Unexpected Chain value"); @@ -6625,7 +6713,7 @@ Subtarget->getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF); // Conservatively assume the function requires the lazy-save mechanism. - if (SMEAttrs(MF.getFunction()).hasZAState()) { + if (SMEAttrs(MF.getFunction(), Subtarget->hasSME2()).hasZAState()) { unsigned TPIDR2Obj = allocateLazySaveBuffer(Chain, DL, DAG); FuncInfo->setLazySaveTPIDR2Obj(TPIDR2Obj); } @@ -6871,7 +6959,7 @@ // SME Streaming functions are not eligible for TCO as they may require // the streaming mode or ZA to be restored after returning from the call. - SMEAttrs CallerAttrs(MF.getFunction()); + SMEAttrs CallerAttrs(MF.getFunction(), Subtarget->hasSME2()); auto CalleeAttrs = CLI.CB ? SMEAttrs(*CLI.CB) : SMEAttrs(SMEAttrs::Normal); if (CallerAttrs.requiresSMChange(CalleeAttrs) || CallerAttrs.requiresLazySave(CalleeAttrs)) @@ -7203,7 +7291,7 @@ } // Determine whether we need any streaming mode changes. - SMEAttrs CalleeAttrs, CallerAttrs(MF.getFunction()); + SMEAttrs CalleeAttrs, CallerAttrs(MF.getFunction(), Subtarget->hasSME2()); if (CLI.CB) CalleeAttrs = SMEAttrs(*CLI.CB); else if (std::optional Attrs = @@ -7212,22 +7300,29 @@ bool RequiresLazySave = CallerAttrs.requiresLazySave(CalleeAttrs); + // The SMEABIPass handles lazy-saves around Invokes, so they do not need + // to be handled again here. + if (RequiresLazySave && CLI.CB && isa(CLI.CB)) + RequiresLazySave = false; + MachineFrameInfo &MFI = MF.getFrameInfo(); if (RequiresLazySave) { // Set up a lazy save mechanism by storing the runtime live slices - // (worst-case N*N) to the TPIDR2 stack object. - SDValue N = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64, - DAG.getConstant(1, DL, MVT::i32)); - SDValue NN = DAG.getNode(ISD::MUL, DL, MVT::i64, N, N); - unsigned TPIDR2Obj = FuncInfo->getLazySaveTPIDR2Obj(); + // to the TPIDR2 stack object. 
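// For reference, the 16-byte TPIDR2 block that this code and the SMEABI pass
// manipulate by raw offset corresponds to the StructType {ptr, i16, [6 x i8]}
// built in SMEABIPass.cpp; as a C-level sketch (pieced together from the
// offsets used in this patch, not a normative ABI definition):
//
//   struct TPIDR2Block {
//     void    *za_save_buffer;       // bytes 0-7:  SVL.B x SVL.B byte buffer
//     uint16_t num_za_save_slices;   // bytes 8-9:  live ZA slices to save
//     uint8_t  reserved[6];          // bytes 10-15: must be written as zero
//   };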
+ SDValue LiveSlices = + DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64, + DAG.getConstant(Intrinsic::aarch64_sme_get_live_za_slices, + DL, MVT::i32)); + unsigned TPIDR2Obj = FuncInfo->getLazySaveTPIDR2Obj(); MachinePointerInfo MPI = MachinePointerInfo::getStack(MF, TPIDR2Obj); SDValue TPIDR2ObjAddr = DAG.getFrameIndex(TPIDR2Obj, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout())); SDValue BufferPtrAddr = DAG.getNode(ISD::ADD, DL, TPIDR2ObjAddr.getValueType(), TPIDR2ObjAddr, DAG.getConstant(8, DL, TPIDR2ObjAddr.getValueType())); - Chain = DAG.getTruncStore(Chain, DL, NN, BufferPtrAddr, MPI, MVT::i16); + Chain = + DAG.getTruncStore(Chain, DL, LiveSlices, BufferPtrAddr, MPI, MVT::i16); Chain = DAG.getNode( ISD::INTRINSIC_VOID, DL, MVT::Other, Chain, DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32), @@ -7240,6 +7335,21 @@ if (RequiresSMChange) PStateSM = getPStateSM(DAG, Chain, CallerAttrs, DL, MVT::i64); + bool PreserveZT = CallerAttrs.requiresPreservingZT(CalleeAttrs); + + SDValue ZTFrameIdx; + + if (PreserveZT) { + unsigned ZTObj = MFI.CreateSpillStackObject(64, Align(16)); + ZTFrameIdx = DAG.getFrameIndex( + ZTObj, + DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout())); + + Chain = DAG.getNode( + AArch64ISD::SAVE_ZT, DL, DAG.getVTList(MVT::Other), + {Chain.getValue(0), DAG.getConstant(0, DL, MVT::i32), ZTFrameIdx}); + } + // Adjust the stack pointer for the new arguments... // These operations are automatically eliminated by the prolog/epilog pass if (!IsSibCall) @@ -7657,7 +7767,14 @@ AArch64ISD::SMSTART, DL, MVT::Other, Result, DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32), DAG.getConstant(0, DL, MVT::i64), DAG.getConstant(1, DL, MVT::i64)); + } + + if (PreserveZT) + Result = DAG.getNode( + AArch64ISD::RESTORE_ZT, DL, DAG.getVTList(MVT::Other), + {Result.getValue(0), DAG.getConstant(0, DL, MVT::i32), ZTFrameIdx}); + if (RequiresLazySave) { // Conditionally restore the lazy save using a pseudo node. 
unsigned FI = FuncInfo->getLazySaveTPIDR2Obj(); SDValue RegMask = DAG.getRegisterMask( @@ -7688,7 +7805,7 @@ DAG.getConstant(0, DL, MVT::i64)); } - if (RequiresSMChange || RequiresLazySave) { + if (RequiresSMChange || RequiresLazySave || PreserveZT) { for (unsigned I = 0; I < InVals.size(); ++I) { // The smstart/smstop is chained as part of the call, but when the // resulting chain is discarded (which happens when the call is not part @@ -7781,7 +7898,7 @@ const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); // Emit SMSTOP before returning from a locally streaming function - SMEAttrs FuncAttrs(MF.getFunction()); + SMEAttrs FuncAttrs(MF.getFunction(), Subtarget->hasSME2()); if (FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface()) { Chain = DAG.getNode( AArch64ISD::SMSTOP, DL, DAG.getVTList(MVT::Other, MVT::Glue), Chain, @@ -22584,6 +22701,34 @@ return DAG.getMergeValues( {A, DAG.getZExtOrTrunc(B, DL, MVT::i1), A.getValue(2)}, DL); } + case Intrinsic::aarch64_sme_invoke_resume_pstatesm: { + SDLoc DL(N); + const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); + SmallVector SMOps; + SMOps.push_back(/*Chain*/ N->getOperand(0)); + SMOps.push_back( + DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRSM), DL, MVT::i32)); + assert(N->getOperand(0) == DAG.getEntryNode() && "Unexpected Chain value"); + auto *ConstOp = dyn_cast(N->getOperand(2)); + if (ConstOp && ConstOp->getZExtValue() == 1) + SMOps.push_back(DAG.getConstant(1, DL, MVT::i64)); + else + SMOps.push_back(/*PState.SM on entry*/ N->getOperand(2).getValue(0)); + SMOps.push_back(DAG.getTargetConstant(0, DL, MVT::i64)); + SMOps.push_back( + DAG.getRegisterMask(TRI->getSMStartStopCallPreservedMask())); + SMOps.push_back(/*Glue*/N->getOperand(0).getValue(1)); + return DAG.getNode(AArch64ISD::SMSTART, DL, + DAG.getVTList(MVT::Other, MVT::Glue), SMOps); + } + case Intrinsic::aarch64_sme_ldr_zt: + return DAG.getNode(AArch64ISD::RESTORE_ZT, SDLoc(N), + DAG.getVTList(MVT::Other), N->getOperand(0), + N->getOperand(2), N->getOperand(3)); + case Intrinsic::aarch64_sme_str_zt: + return DAG.getNode(AArch64ISD::SAVE_ZT, SDLoc(N), + DAG.getVTList(MVT::Other), N->getOperand(0), + N->getOperand(2), N->getOperand(3)); default: break; } @@ -23966,7 +24111,7 @@ // Checks to allow the use of SME instructions if (auto *Base = dyn_cast(&Inst)) { - auto CallerAttrs = SMEAttrs(*Inst.getFunction()); + auto CallerAttrs = SMEAttrs(*Inst.getFunction(), Subtarget->hasSME2()); auto CalleeAttrs = SMEAttrs(*Base); if (CallerAttrs.requiresSMChange(CalleeAttrs, /*BodyOverridesInterface=*/false) || diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td --- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -1435,11 +1435,11 @@ // SME2 vector select offset operands -// uimm3s8 predicate +// tuimm3s8_32 predicate // True if the immediate is a multiple of 8 in the range [0,56]. 
def UImm3s8Operand : UImmScaledMemoryIndexed<3, 8>; -def uimm3s8 : Operand, ImmLeaf, TImmLeaf= 0 && Imm <= 56 && ((Imm % 8) == 0); }], UImmS8XForm> { let PrintMethod = "printVectorIndex<8>"; let ParserMatchClass = UImm3s8Operand; @@ -12130,3 +12130,4 @@ def : TokenAlias<".S", ".s">; def : TokenAlias<".D", ".d">; def : TokenAlias<".Q", ".q">; +def : TokenAlias<".2B", ".2b">; diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -3105,6 +3105,12 @@ MinOffset = 0; MaxOffset = 63; break; + case AArch64::LDR_ZA: + case AArch64::STR_ZA: + Scale = TypeSize::Scalable(16); + Width = SVEMaxBytesPerVector; + MinOffset = 0; + MaxOffset = 15; } return true; diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -432,6 +432,12 @@ Reserved.set(SubReg); } + if (MF.getSubtarget().hasSME2()) { + for (MCSubRegIterator SubReg(AArch64::ZT0, this, /*self=*/true); + SubReg.isValid(); ++SubReg) + Reserved.set(*SubReg); + } + markSuperRegs(Reserved, AArch64::FPCR); assert(checkAllSuperRegsMarked(Reserved)); diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td --- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td @@ -22,6 +22,12 @@ [SDTCisInt<0>, SDTCisPtrTy<1>]>, [SDNPHasChain, SDNPSideEffect, SDNPVariadic, SDNPOptInGlue]>; +def AArch64_restore_zt : SDNode<"AArch64ISD::RESTORE_ZT", SDTypeProfile<0, 2, + [SDTCisInt<0>, SDTCisPtrTy<1>]>, + [SDNPHasChain, SDNPSideEffect, SDNPMayLoad]>; +def AArch64_save_zt : SDNode<"AArch64ISD::SAVE_ZT", SDTypeProfile<0, 2, + [SDTCisInt<0>, SDTCisPtrTy<1>]>, + [SDNPHasChain, SDNPSideEffect, SDNPMayStore]>; def AArch64ObscureCopy : SDNode<"AArch64ISD::OBSCURE_COPY", SDTypeProfile<1, 1, []>, []>; @@ -572,19 +578,19 @@ defm UMOPA_MPPZZ_HtoS : sme2_int_mopx_tile<"umopa", 0b100, int_aarch64_sme_umopa_za32>; defm UMOPS_MPPZZ_HtoS : sme2_int_mopx_tile<"umops", 0b101, int_aarch64_sme_umops_za32>; -def ZERO_T : sme2_zero_zt<"zero", 0b0001>; +defm ZERO_T : sme2_zero_zt<"zero", 0b0001>; -def LDR_TX : sme2_spill_fill_vector<"ldr", 0b01111100>; -def STR_TX : sme2_spill_fill_vector<"str", 0b11111100>; +defm LDR_TX : sme2_spill_fill_vector<"ldr", 0b01111100, AArch64_restore_zt>; +defm STR_TX : sme2_spill_fill_vector<"str", 0b11111100, AArch64_save_zt>; def MOVT_XTI : sme2_movt_zt_to_scalar<"movt", 0b0011111>; def MOVT_TIX : sme2_movt_scalar_to_zt<"movt", 0b0011111>; -defm LUTI2_ZTZI : sme2_luti2_vector_index<"luti2">; +defm LUTI2_ZTZI : sme2_luti2_vector_index<"luti2", int_aarch64_sme_luti2_lane_zt>; defm LUTI2_2ZTZI : sme2_luti2_vector_vg2_index<"luti2">; defm LUTI2_4ZTZI : sme2_luti2_vector_vg4_index<"luti2">; -defm LUTI4_ZTZI : sme2_luti4_vector_index<"luti4">; +defm LUTI4_ZTZI : sme2_luti4_vector_index<"luti4", int_aarch64_sme_luti4_lane_zt>; defm LUTI4_2ZTZI : sme2_luti4_vector_vg2_index<"luti4">; defm LUTI4_4ZTZI : sme2_luti4_vector_vg4_index<"luti4">; @@ -632,8 +638,8 @@ defm UQRSHRN_VG4_Z4ZI : sme2_sat_shift_vector_vg4<"uqrshrn", 0b101, int_aarch64_sve_uqrshrn_x4>; defm SQRSHRUN_VG4_Z4ZI : sme2_sat_shift_vector_vg4<"sqrshrun", 0b110, int_aarch64_sve_sqrshrun_x4>; -defm SEL_VG2_2ZP2Z2Z: sme2_sel_vector_vg2<"sel">; -defm SEL_VG4_4ZP4Z4Z: sme2_sel_vector_vg4<"sel">; 
+defm SEL_VG2_2ZC2Z2Z: sme2_sel_vector_vg2<"sel">; +defm SEL_VG4_4ZC4Z4Z: sme2_sel_vector_vg4<"sel">; def LD1B_VG2_M2ZPXX : sme2_ld_vector_vg2_multi_scalar_scalar<0b00, 0b0, ZZ_b_strided, GPR64shifted8, "ld1b">; def LD1B_VG4_M4ZPXX : sme2_ld_vector_vg4_multi_scalar_scalar<0b00, 0b0, ZZZZ_b_strided, GPR64shifted8, "ld1b">; diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -2792,6 +2792,7 @@ } defm Pat_Store_P16 : unpred_store_predicate; + defm Pat_Store_PredAsCount : unpred_store_predicate; multiclass unpred_load_predicate { def _fi : Pat<(Ty (load (am_sve_fi GPR64sp:$base, simm9:$offset))), @@ -2802,6 +2803,7 @@ } defm Pat_Load_P16 : unpred_load_predicate; + defm Pat_Load_PredAsCount : unpred_load_predicate; multiclass ld1 { @@ -3741,10 +3743,10 @@ defm FCLAMP_ZZZ : sve2p1_fclamp<"fclamp", int_aarch64_sve_fclamp>; defm FDOT_ZZZ_S : sve_float_dot<0b0, "fdot", nxv8f16, int_aarch64_sve_fdot_x2>; defm FDOT_ZZZI_S : sve_float_dot_indexed<0b0, "fdot", nxv8f16, int_aarch64_sve_fdot_lane_x2>; -def BFMLSLB_ZZZ_S : sve2_fp_mla_long<0b110, "bfmlslb">; -def BFMLSLT_ZZZ_S : sve2_fp_mla_long<0b111, "bfmlslt">; -def BFMLSLB_ZZZI_S : sve2_fp_mla_long_by_indexed_elem<0b110, "bfmlslb">; -def BFMLSLT_ZZZI_S : sve2_fp_mla_long_by_indexed_elem<0b111, "bfmlslt">; +defm BFMLSLB_ZZZ_S : sve_bfloat_matmul_longvecl<0b0, 0b1, "bfmlslb", int_aarch64_sve_bfmlslb>; +defm BFMLSLT_ZZZ_S : sve_bfloat_matmul_longvecl<0b1, 0b1, "bfmlslt", int_aarch64_sve_bfmlslt>; +defm BFMLSLB_ZZZI_S : sve_bfloat_matmul_longvecl_idx<0b0, 0b1, "bfmlslb", int_aarch64_sve_bfmlslb_lane>; +defm BFMLSLT_ZZZI_S : sve_bfloat_matmul_longvecl_idx<0b1, 0b1, "bfmlslt", int_aarch64_sve_bfmlslt_lane>; defm SDOT_ZZZ_HtoS : sve2p1_two_way_dot_vv<"sdot", 0b0, int_aarch64_sve_sdot_x2>; defm UDOT_ZZZ_HtoS : sve2p1_two_way_dot_vv<"udot", 0b1, int_aarch64_sve_udot_x2>; @@ -3752,7 +3754,7 @@ defm UDOT_ZZZI_HtoS : sve2p1_two_way_dot_vvi<"udot", 0b1, int_aarch64_sve_udot_lane_x2>; defm CNTP_XCI : sve2p1_pcount_pn<"cntp", 0b000>; -defm PEXT_PCI : sve2p1_pred_as_ctr_to_mask<"pext">; +defm PEXT_PCI : sve2p1_pred_as_ctr_to_mask<"pext", int_aarch64_sve_pext>; defm PEXT_2PCI : sve2p1_pred_as_ctr_to_mask_pair<"pext">; defm PTRUE_C : sve2p1_ptrue_pn<"ptrue">; @@ -3835,6 +3837,61 @@ defm STNT1W_4Z_IMM : sve2p1_mem_cst_si_4z<"stnt1w", 0b10, 0b1, ZZZZ_s_mul_r>; defm STNT1D_4Z_IMM : sve2p1_mem_cst_si_4z<"stnt1d", 0b11, 0b1, ZZZZ_d_mul_r>; + + multiclass store_pn_vg2 { + def : Pat<(Store (Ty ZPR:$vec0), (Ty ZPR:$vec1), + (aarch64svcount PPR:$PNg), GPR64:$base), + (RegImmInst (REG_SEQUENCE ZPR2, Ty:$vec0, zsub0, Ty:$vec1, zsub1), + PPR:$PNg, GPR64:$base, (i64 0))>; + } + + // Stores of 2 consecutive vectors + defm : store_pn_vg2; + defm : store_pn_vg2; + defm : store_pn_vg2; + defm : store_pn_vg2; + defm : store_pn_vg2; + defm : store_pn_vg2; + defm : store_pn_vg2; + defm : store_pn_vg2; + defm : store_pn_vg2; + defm : store_pn_vg2; + defm : store_pn_vg2; + defm : store_pn_vg2; + defm : store_pn_vg2; + defm : store_pn_vg2; + defm : store_pn_vg2; + defm : store_pn_vg2; + + multiclass store_pn_vg4 { + def : Pat<(Store (Ty ZPR:$vec0), (Ty ZPR:$vec1), (Ty ZPR:$vec2), (Ty ZPR:$vec3), + (aarch64svcount PPR:$PNg), GPR64:$base), + (RegImmInst (REG_SEQUENCE ZPR4, Ty:$vec0, zsub0, Ty:$vec1, zsub1, + Ty:$vec2, zsub2, Ty:$vec3, zsub3), + PPR:$PNg, GPR64:$base, (i64 0))>; + } + + // Stores of 4 consecutive vectors + defm : 
store_pn_vg4; + defm : store_pn_vg4; + defm : store_pn_vg4; + defm : store_pn_vg4; + defm : store_pn_vg4; + defm : store_pn_vg4; + defm : store_pn_vg4; + defm : store_pn_vg4; + defm : store_pn_vg4; + defm : store_pn_vg4; + defm : store_pn_vg4; + defm : store_pn_vg4; + defm : store_pn_vg4; + defm : store_pn_vg4; + defm : store_pn_vg4; + defm : store_pn_vg4; + + defm WHILEGE_2PXX : sve2p1_int_while_rr_pair<"whilege", 0b000>; defm WHILEGT_2PXX : sve2p1_int_while_rr_pair<"whilegt", 0b001>; defm WHILELT_2PXX : sve2p1_int_while_rr_pair<"whilelt", 0b010>; @@ -3884,18 +3941,18 @@ def BFMLA_ZPmZZ : sve_fp_3op_p_zds_a<0b00, 0b00, "bfmla", ZPR16>; def BFMLS_ZPmZZ : sve_fp_3op_p_zds_a<0b00, 0b01, "bfmls", ZPR16>; -def BFADD_ZPZmZ : sve_fp_2op_p_zds<0b00, 0b0000, "bfadd", ZPR16>; -def BFSUB_ZPZmZ : sve_fp_2op_p_zds<0b00, 0b0001, "bfsub", ZPR16>; -def BFMUL_ZPZmZ : sve_fp_2op_p_zds<0b00, 0b0010, "bfmul", ZPR16>; -def BFMAXNM_ZPZmZ : sve_fp_2op_p_zds<0b00, 0b0100, "bfmaxnm", ZPR16>; -def BFMINNM_ZPZmZ : sve_fp_2op_p_zds<0b00, 0b0101, "bfminnm", ZPR16>; -def BFMAX_ZPZmZ : sve_fp_2op_p_zds<0b00, 0b0110, "bfmax", ZPR16>; -def BFMIN_ZPZmZ : sve_fp_2op_p_zds<0b00, 0b0111, "bfmin", ZPR16>; +defm BFADD_ZPZmZ : sve2p1_bf_2op_p_zds<0b00, 0b0000, "bfadd", ZPR16, int_aarch64_sve_fadd>; +defm BFSUB_ZPZmZ : sve2p1_bf_2op_p_zds<0b00, 0b0001, "bfsub", ZPR16, int_aarch64_sve_fsub>; +defm BFMUL_ZPZmZ : sve2p1_bf_2op_p_zds<0b00, 0b0010, "bfmul", ZPR16, int_aarch64_sve_fmul>; +defm BFMAXNM_ZPZmZ : sve2p1_bf_2op_p_zds<0b00, 0b0100, "bfmaxnm", ZPR16, int_aarch64_sve_fmaxnm>; +defm BFMINNM_ZPZmZ : sve2p1_bf_2op_p_zds<0b00, 0b0101, "bfminnm", ZPR16, int_aarch64_sve_fminnm>; +defm BFMAX_ZPZmZ : sve2p1_bf_2op_p_zds<0b00, 0b0110, "bfmax", ZPR16, int_aarch64_sve_fmax>; +defm BFMIN_ZPZmZ : sve2p1_bf_2op_p_zds<0b00, 0b0111, "bfmin", ZPR16, int_aarch64_sve_fmin>; defm BFMLA_ZZZI : sve2p1_fp_bfma_by_indexed_elem<"bfmla", 0b10>; defm BFMLS_ZZZI : sve2p1_fp_bfma_by_indexed_elem<"bfmls", 0b11>; -defm BFMUL_ZZZI : sve2p1_fp_bfmul_by_indexed_elem<"bfmul">; +defm BFMUL_ZZZI : sve2p1_fp_bfmul_by_indexed_elem<"bfmul", int_aarch64_sve_fmul_lane>; def BFCLAMP_ZZZ : sve2p1_fclamp<"bfclamp", 0b00, ZPR16>; } // End HasSVE2p1_or_HasSME2p1, HasB16B16 diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp --- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -164,6 +164,11 @@ cl::desc("Enable SVE intrinsic opts"), cl::init(true)); +cl::opt + EnableSMEPeepholeOpt("enable-aarch64-sme-peephole-opt", cl::init(true), + cl::Hidden, + cl::desc("Perform SME peephole optimization")); + static cl::opt EnableFalkorHWPFFix("aarch64-enable-falkor-hwpf-fix", cl::init(true), cl::Hidden); @@ -218,6 +223,7 @@ initializeAArch64KCFIPass(*PR); initializeAArch64LoadStoreOptPass(*PR); initializeAArch64MIPeepholeOptPass(*PR); + initializeSMEPeepholeOptPass(*PR); initializeAArch64SIMDInstrOptPass(*PR); initializeAArch64O0PreLegalizerCombinerPass(*PR); initializeAArch64PreLegalizerCombinerPass(*PR); @@ -710,6 +716,9 @@ } void AArch64PassConfig::addMachineSSAOptimization() { + if (TM->getOptLevel() != CodeGenOpt::None && EnableSMEPeepholeOpt) + addPass(createSMEPeepholeOptPass()); + // Run default MachineSSAOptimization first. 
TargetPassConfig::addMachineSSAOptimization(); diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -9,6 +9,7 @@ #include "AArch64TargetTransformInfo.h" #include "AArch64ExpandImm.h" #include "AArch64PerfectShuffle.h" +#include "Utils/AArch64BaseInfo.h" #include "MCTargetDesc/AArch64AddressingModes.h" #include "llvm/Analysis/IVDescriptors.h" #include "llvm/Analysis/LoopInfo.h" @@ -130,7 +131,7 @@ bool AArch64TTIImpl::areInlineCompatible(const Function *Caller, const Function *Callee) const { - SMEAttrs CallerAttrs(*Caller); + SMEAttrs CallerAttrs(*Caller, ST->hasSME2()); SMEAttrs CalleeAttrs(*Callee); if (CallerAttrs.requiresSMChange(CalleeAttrs, /*BodyOverridesInterface=*/true) || @@ -685,6 +686,11 @@ if (auto BinOpCombine = tryCombineFromSVBoolBinOp(IC, II)) return BinOpCombine; + // Ignore converts to/from svcount_t. + if (isa(II.getArgOperand(0)->getType()) || + isa(II.getType())) + return std::nullopt; + SmallVector CandidatesForRemoval; Value *Cursor = II.getOperand(0), *EarliestReplacement = nullptr; diff --git a/llvm/lib/Target/AArch64/CMakeLists.txt b/llvm/lib/Target/AArch64/CMakeLists.txt --- a/llvm/lib/Target/AArch64/CMakeLists.txt +++ b/llvm/lib/Target/AArch64/CMakeLists.txt @@ -75,6 +75,7 @@ AArch64PBQPRegAlloc.cpp AArch64RegisterInfo.cpp AArch64SLSHardening.cpp + SMEPeepholeOpt.cpp AArch64SelectionDAGInfo.cpp AArch64SpeculationHardening.cpp AArch64StackTagging.cpp diff --git a/llvm/lib/Target/AArch64/SMEABIPass.cpp b/llvm/lib/Target/AArch64/SMEABIPass.cpp --- a/llvm/lib/Target/AArch64/SMEABIPass.cpp +++ b/llvm/lib/Target/AArch64/SMEABIPass.cpp @@ -13,10 +13,13 @@ //===----------------------------------------------------------------------===// #include "AArch64.h" +#include "AArch64Subtarget.h" +#include "AArch64TargetMachine.h" #include "Utils/AArch64BaseInfo.h" #include "Utils/AArch64SMEAttributes.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" +#include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/Constants.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" @@ -40,14 +43,31 @@ bool runOnFunction(Function &F) override; + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + } + private: - bool updateNewZAFunctions(Module *M, Function *F, IRBuilder<> &Builder); + bool updateNewZAFunctions(Module *M, Function *F, IRBuilder<> &Builder, + bool ClearZTState); + bool handleExceptions(IRBuilder<> &Builder, Function *F, StructType *TPITy); + AllocaInst *createZABufferAndTPIDR2Block(IRBuilder<> &Builder, Function *F, + StructType *TPITy, + StringRef ObjName); + void setupLazySave(IRBuilder<> &Builder, Instruction *Call, + AllocaInst *TPIObj, BasicBlock *CheckBB, + BasicBlock *ResumeBB, BasicBlock *RestoreBB); + void restoreLazySave(IRBuilder<> &Builder, Instruction *Call, + AllocaInst *TPIObj, BasicBlock *RestoreBB, + BasicBlock *ResumeBB = nullptr, + Value *PStateOnEntry = nullptr); }; } // end anonymous namespace char SMEABI::ID = 0; static const char *name = "SME ABI Pass"; INITIALIZE_PASS_BEGIN(SMEABI, DEBUG_TYPE, name, false, false) +INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) INITIALIZE_PASS_END(SMEABI, DEBUG_TYPE, name, false, false) FunctionPass *llvm::createSMEABIPass() { return new SMEABI(); } @@ -56,8 +76,42 @@ // Utility functions 
//===----------------------------------------------------------------------===// +// Utility function to emit a call to __arm_sme_state and return 'pstate.sm' value. +Value *emitGetPStateSM(Module *M, IRBuilder<> &Builder) { + auto *FTy = FunctionType::get( + StructType::create({Builder.getInt64Ty(), Builder.getInt64Ty()}), {}, + /*IsVarArgs=*/false); + auto Attrs = + AttributeList() + .addFnAttribute(M->getContext(), "aarch64_pstate_sm_compatible") + .addFnAttribute(M->getContext(), "aarch64_pstate_za_preserved"); + FunctionCallee Callee = M->getOrInsertFunction("__arm_sme_state", FTy, Attrs); + CallInst *Call = Builder.CreateCall(Callee); + Call->setCallingConv( + CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2); + + // Extract PSTATE.SM from X0 + Value *X0 = Builder.CreateExtractValue(Call, 0); + return Builder.CreateAnd(X0, Builder.getInt64(1)); +} + +// Utility function to emit a call to __arm_tpidr2_restore. +void emitTPIDR2Restore(Module *M, IRBuilder<> &Builder, Value *TPIDR2Obj) { + auto *FTy = FunctionType::get(Builder.getVoidTy(), {Builder.getInt8PtrTy()}, + /*IsVarArgs=*/false); + auto Attrs = + AttributeList() + .addFnAttribute(M->getContext(), "aarch64_pstate_sm_compatible") + .addFnAttribute(M->getContext(), "aarch64_pstate_za_shared"); + FunctionCallee Callee = M->getOrInsertFunction("__arm_tpidr2_restore", FTy, Attrs); + CallInst *Call = Builder.CreateCall( + Callee, Builder.CreatePointerCast(TPIDR2Obj, Builder.getInt8PtrTy())); + Call->setCallingConv( + CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0); +} + // Utility function to emit a call to __arm_tpidr2_save and clear TPIDR2_EL0. -void emitTPIDR2Save(Module *M, IRBuilder<> &Builder) { +void emitTPIDR2Save(Module *M, IRBuilder<> &Builder, bool ClearTPIDR2) { auto *TPIDR2SaveTy = FunctionType::get(Builder.getVoidTy(), {}, /*IsVarArgs=*/false); auto Attrs = @@ -70,11 +124,280 @@ Call->setCallingConv( CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0); - // A save to TPIDR2 should be followed by clearing TPIDR2_EL0. + if (ClearTPIDR2) { + Function *WriteIntr = + Intrinsic::getDeclaration(M, Intrinsic::aarch64_sme_set_tpidr2); + Builder.CreateCall(WriteIntr->getFunctionType(), WriteIntr, + Builder.getInt64(0)); + } +} + +void SMEABI::restoreLazySave( + IRBuilder<> &Builder, Instruction *Call, AllocaInst *TPIObj, + BasicBlock *RestoreBB, BasicBlock *ResumeBB, Value *PStateOnEntry) { + Module *M = Call->getModule(); + + // If Call is an Invoke, restore the lazy save in the normal destination. + // Otherwise, restore the lazy save immediately after Call. + if (auto *II = dyn_cast(Call)) + Builder.SetInsertPoint(II->getNormalDest()->getFirstNonPHIOrDbg()); + else + Builder.SetInsertPoint(Call->getParent(), std::next(Call->getIterator())); + + // Re-enable pstate.za. + Function *EnableZAIntr = + Intrinsic::getDeclaration(M, Intrinsic::aarch64_sme_za_enable); + Builder.CreateCall(EnableZAIntr->getFunctionType(), EnableZAIntr); + + // Create an intrinsic call to restore ZA, passing the 64-bit data pointer + // to the TPIDR2 block. 
+ if (ResumeBB) + Builder.SetInsertPoint(RestoreBB); + emitTPIDR2Restore(M, Builder, TPIObj); + if (ResumeBB) + Builder.CreateBr(ResumeBB); +} + +void SMEABI::setupLazySave(IRBuilder<> &Builder, Instruction *Call, + AllocaInst *TPIObj, BasicBlock *CheckBB, + BasicBlock *ResumeBB, BasicBlock *RestoreBB) { + Module *M = Call->getModule(); + Builder.SetInsertPoint(Call); + + // Store the number of live slices to the num_za_save_slices field + // of the TPIDR2 block + Function *LiveIntr = + Intrinsic::getDeclaration(M, Intrinsic::aarch64_sme_get_live_za_slices); + auto *Live = + Builder.CreateCall(LiveIntr->getFunctionType(), LiveIntr, {}, "live"); + auto *Trunc = Builder.CreateTrunc(Live, Builder.getInt16Ty(), "live.trunc"); + auto *TPILive = Builder.CreateGEP(TPIObj->getAllocatedType(), TPIObj, + {Builder.getInt64(0), Builder.getInt32(1)}, + "tpidr2.obj.live"); + Builder.CreateStore(Trunc, TPILive); + + auto *PtrToInt = + Builder.CreatePtrToInt(TPIObj, Builder.getInt64Ty(), "tpi.int"); + + if (dyn_cast(Call)) { + // Restart pstate.za if this is an Invoke, as we may be setting up + // a lazy-save in the exception handler. + // TODO: This will start pstate.za unnecessarily if the parent block is + // not an unwind destination. It might be possible to improve this by + // creating a mapping of blocks to ZA/SM states. + Function *EnableZAIntr = + Intrinsic::getDeclaration(M, Intrinsic::aarch64_sme_za_enable); + Builder.CreateCall(EnableZAIntr->getFunctionType(), EnableZAIntr); + } + + // Set TPIDR2_EL0 to the new object Function *WriteIntr = Intrinsic::getDeclaration(M, Intrinsic::aarch64_sme_set_tpidr2); + Builder.CreateCall(WriteIntr->getFunctionType(), WriteIntr, PtrToInt); + + if (!CheckBB) { + // If no CheckBB block was passed in, Call should be an Invoke with shared + // or preserved ZA and we don't need to restore the lazy-save unless we + // catch an exception. Abandon the lazy-save before resuming the function + // at the normal destination. + auto *Invoke = dyn_cast(Call); + assert(Invoke && "CheckBB has not been provided for restoring lazy-save."); + auto *FirstInst = Invoke->getNormalDest()->getFirstNonPHIOrDbg(); + auto *II = dyn_cast(FirstInst); + // Avoid setting TPIDR2_EL0 to null more than once. + if (!II || II->getIntrinsicID() != Intrinsic::aarch64_sme_set_tpidr2) { + Builder.SetInsertPoint(FirstInst); + Builder.CreateCall(WriteIntr->getFunctionType(), WriteIntr, + ConstantInt::get(Builder.getInt64Ty(), 0)); + } + return; + } + + // Check if the lazy save has been committed by the callee and should + // be restored. + Builder.SetInsertPoint(CheckBB); + Function *GetEL0Intr = + Intrinsic::getDeclaration(M, Intrinsic::aarch64_sme_get_tpidr2); + auto *GetEL0 = Builder.CreateCall(GetEL0Intr->getFunctionType(), GetEL0Intr, + {}, "tpidr2"); + auto *Cmp = Builder.CreateCmp(ICmpInst::ICMP_EQ, GetEL0, + ConstantInt::get(GetEL0->getType(), 0), "cmp"); + Builder.CreateCondBr(Cmp, RestoreBB, ResumeBB); + + // Conditionally store ZA after the call. + restoreLazySave(Builder, Call, TPIObj, RestoreBB, ResumeBB); + + // Ensure we only set TPIDR2_EL0 to null once. + auto *FirstInst = ResumeBB->getFirstNonPHIOrDbg(); + auto *II = dyn_cast(FirstInst); + if (II && II->getIntrinsicID() == Intrinsic::aarch64_sme_set_tpidr2) + return; + + // Set TPIDR2_EL0 to null before continuing with the rest of the function. + // This will already be null if ZA was restored above, but is necessary + // to abandon the lazy-save if the callee did not commit it. 
+ Builder.SetInsertPoint(FirstInst); Builder.CreateCall(WriteIntr->getFunctionType(), WriteIntr, - Builder.getInt64(0)); + ConstantInt::get(Builder.getInt64Ty(), 0)); + + return; +} + +AllocaInst *SMEABI::createZABufferAndTPIDR2Block( + IRBuilder<> &Builder, Function *F, StructType *TPITy, StringRef ObjName) { + Module *M = F->getParent(); + + Builder.SetInsertPoint(&*F->getEntryBlock().getFirstInsertionPt()); + auto *TPIObj = Builder.CreateAlloca(TPITy, nullptr, ObjName); + + // Allocate a buffer big enough to hold the max ZA size (SVL.B x SVL.B) + Function *NIntr = Intrinsic::getDeclaration(M, Intrinsic::aarch64_sme_cntsb); + auto *N = Builder.CreateCall(NIntr->getFunctionType(), NIntr, {}, "N"); + auto *NN = Builder.CreateMul(N, N, "NN"); + auto *Buffer = Builder.CreateAlloca(Builder.getInt8Ty(), NN, "buffer"); + Buffer->setAlignment(Align(16)); + + // Fill the za_save_buffer field of the new TPIDR2 block + auto *TPIBuffer = Builder.CreateGEP( + TPIObj->getAllocatedType(), TPIObj, + {Builder.getInt64(0), Builder.getInt32(0)}, ObjName + ".buffer"); + Builder.CreateStore(Buffer, TPIBuffer); + + // Clear reserved bytes 10-15 of the TPIDR2 block + auto *Reserved = Builder.CreateAlloca(Builder.getIntNTy(48), + Builder.getIntN(48, 0), "zero"); + Reserved->setAlignment(Align(16)); + TPIBuffer = + Builder.CreateGEP(TPIObj->getAllocatedType(), TPIObj, + {Builder.getInt64(0), Builder.getInt32(2)}, "reserved"); + Builder.CreateStore(Reserved, TPIBuffer); + + return TPIObj; +} + +bool SMEABI::handleExceptions(IRBuilder<> &Builder, Function *F, + StructType *TPITy) { + Module *M = F->getParent(); + LLVMContext &Context = F->getContext(); + SmallVector Invokes; + SmallVector Resumes; + SmallVector BeginCatchCalls; + SMEAttrs FnAttrs(*F); + + AllocaInst *TPIObj = nullptr; + + for (BasicBlock &BB : *F) { + for (Instruction &I : BB) { + if (auto *Invoke = dyn_cast(&I)) + Invokes.push_back(Invoke); + else if (auto *Resume = dyn_cast(&I)) + Resumes.push_back(Resume); + else if (auto *Call = dyn_cast(&I)) { + if (Call->getCalledFunction() && + Call->getCalledFunction()->getName() == "__cxa_begin_catch") + BeginCatchCalls.push_back(Call); + } + } + } + + for (InvokeInst *Invoke : Invokes) { + // We only need to set up and restore a lazy-save if there is ZA state. + if (!FnAttrs.hasZAState()) + continue; + + if (!TPIObj) + TPIObj = + createZABufferAndTPIDR2Block(Builder, F, TPITy, "tpidr2.invoke.obj"); + + SMEAttrs InvokeAttrs(*Invoke); + if (InvokeAttrs.hasSharedZAInterface() || InvokeAttrs.preservesZA()) { + // If the Invoke instruction is shared or preserved ZA, setupLazySave + // does not need to restore the lazy-save at the normal destination. + setupLazySave(Builder, Invoke, TPIObj, nullptr, nullptr, nullptr); + } else { + // Otherwise, set up new blocks for restoring ZA if the Invoke was + // successful. Create a new block to read the value of TPIDR2 (CheckBB), + // which becomes the new normal destination for the instruction. + // Create another block to restore ZA (RestoreBB). ResumeBB is the + // original NormalDest for the Invoke. + auto *InvokeBB = Invoke->getParent(); + auto *ResumeBB = Invoke->getNormalDest(); + auto *CheckBB = BasicBlock::Create(Context, "check.za", F, ResumeBB); + Invoke->setNormalDest(CheckBB); + auto *RestoreBB = BasicBlock::Create(Context, "restore.za", F, ResumeBB); + // Update any successor PHI nodes to match the new blocks. 
+ for (PHINode &PN : ResumeBB->phis()) { + PN.replaceIncomingBlockWith(InvokeBB, CheckBB); + PN.addIncoming(PN.getIncomingValueForBlock(CheckBB), RestoreBB); + } + ResumeBB->replaceSuccessorsPhiUsesWith(InvokeBB, ResumeBB); + // Set-up the lazy save for this Invoke. This also handles restoring the + // lazy save for the NormalDest. + setupLazySave(Builder, Invoke, TPIObj, CheckBB, ResumeBB, RestoreBB); + } + + if (!InvokeAttrs.hasNewZAInterface()) { + // New ZA functions are the only function types which will commit + // a lazy-save. For Invokes to any other type of function, create a + // call which saves ZA to ensure that we restore the correct state + // should the callee throw an unhandled exception that unwinds back + // to the caller. + Builder.SetInsertPoint(Invoke); + emitTPIDR2Save(M, Builder, InvokeAttrs.hasSharedZAInterface()); + } + } + + // We may need to restart streaming-mode depending on the starting value + // of pstate.sm on entry to this function (if streaming-compatible). + // Create a call to get this value in the entry block. + Value *PStateOnEntry = ConstantInt::get(Builder.getInt64Ty(), 1); + if (!Invokes.empty() && FnAttrs.hasStreamingCompatibleInterface()) { + Builder.SetInsertPoint(&*F->getEntryBlock().getFirstInsertionPt()); + PStateOnEntry = emitGetPStateSM(M, Builder); + } + + if (TPIObj) { + // Ensure we stop pstate.za before calling the resume instruction. + // TODO: We can improve this by not restoring the lazy-save & restarting + // pstate.za in the first place before blocks which terminate in a resume. + for (ResumeInst *Resume : Resumes) { + Builder.SetInsertPoint(Resume); + Function *DisableZAIntr = + Intrinsic::getDeclaration(M, Intrinsic::aarch64_sme_za_disable); + Builder.CreateCall(DisableZAIntr->getFunctionType(), DisableZAIntr); + } + } + + // Restore state at the beginning of each catch block. + for (CallInst *Catch : BeginCatchCalls) { + Catch->addFnAttr(Attribute::get(Context, "aarch64_pstate_za_preserved")); + + // Restore the lazy-save if there is ZA state. + if (FnAttrs.hasZAState()) + restoreLazySave(Builder, Catch, TPIObj, Catch->getParent(), + /*ResumeBB*/ nullptr, PStateOnEntry); + + // Restart streaming-mode in the catch block if necessary. + if (FnAttrs.requiresSMChange(SMEAttrs())) { + auto *FirstInst = Catch->getParent()->getFirstNonPHIOrDbg(); + // Ensure that any catch blocks which are also landing pads keep the + // LandingPadInst as the first instruction in the block. + if (isa(FirstInst)) + FirstInst = FirstInst->getNextNonDebugInstruction(); + Builder.SetInsertPoint(FirstInst); + Function *EnableSMIntr = Intrinsic::getDeclaration( + M, Intrinsic::aarch64_sme_invoke_resume_pstatesm); + Builder.CreateCall(EnableSMIntr->getFunctionType(), EnableSMIntr, + PStateOnEntry); + } + } + + if (TPIObj || !FnAttrs.hasZAState()) { + F->addFnAttr("aarch64_expanded_pstate_za"); + return true; + } + + return false; } /// This function generates code to commit a lazy save at the beginning of a @@ -83,8 +406,8 @@ /// is active and we should call __arm_tpidr2_save to commit the lazy save. /// Additionally, PSTATE.ZA should be enabled at the beginning of the function /// and disabled before returning. 
-bool SMEABI::updateNewZAFunctions(Module *M, Function *F, - IRBuilder<> &Builder) { +bool SMEABI::updateNewZAFunctions(Module *M, Function *F, IRBuilder<> &Builder, + bool ClearZTState) { LLVMContext &Context = F->getContext(); BasicBlock *OrigBB = &F->getEntryBlock(); @@ -104,7 +427,7 @@ // Create a call __arm_tpidr2_save, which commits the lazy save. Builder.SetInsertPoint(&SaveBB->back()); - emitTPIDR2Save(M, Builder); + emitTPIDR2Save(M, Builder, /* Clear TPIDR2 */ true); // Enable pstate.za at the start of the function. Builder.SetInsertPoint(&OrigBB->front()); @@ -112,6 +435,14 @@ Intrinsic::getDeclaration(M, Intrinsic::aarch64_sme_za_enable); Builder.CreateCall(EnableZAIntr->getFunctionType(), EnableZAIntr); + // If SME2, clear ZT0 on entry to the function, after enabling pstate.za + if (ClearZTState) { + Function *ClearZT0Intr = + Intrinsic::getDeclaration(M, Intrinsic::aarch64_sme_zero_zt); + Builder.CreateCall(ClearZT0Intr->getFunctionType(), ClearZT0Intr, + {Builder.getInt32(0)}); + } + // Before returning, disable pstate.za for (BasicBlock &BB : *F) { Instruction *T = BB.getTerminator(); @@ -132,13 +463,27 @@ LLVMContext &Context = F.getContext(); IRBuilder<> Builder(Context); + TargetPassConfig &TPC = getAnalysis(); + + StructType *TPITy = + StructType::get(Context, {Builder.getInt8PtrTy(), Builder.getInt16Ty(), + ArrayType::get(Builder.getInt8Ty(), 6)}); + if (F.isDeclaration() || F.hasFnAttribute("aarch64_expanded_pstate_za")) return false; bool Changed = false; - SMEAttrs FnAttrs(F); + + const AArch64Subtarget *Subtarget = + TPC.getTM().getSubtargetImpl(F); + + SMEAttrs FnAttrs(F, Subtarget->hasSME2()); if (FnAttrs.hasNewZAInterface()) - Changed |= updateNewZAFunctions(M, &F, Builder); + Changed |= updateNewZAFunctions(M, &F, Builder, + FnAttrs.requiresPreservingZT(SMEAttrs())); + + if (FnAttrs.hasZAState() || FnAttrs.requiresSMChange(SMEAttrs())) + Changed |= handleExceptions(Builder, &F, TPITy); return Changed; } diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td --- a/llvm/lib/Target/AArch64/SMEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td @@ -10,11 +10,13 @@ // //===----------------------------------------------------------------------===// -def imm_to_tile8 : ComplexPattern", []>; -def imm_to_tile16 : ComplexPattern", []>; -def imm_to_tile32 : ComplexPattern", []>; -def imm_to_tile64 : ComplexPattern", []>; -def imm_to_tile128 : ComplexPattern", []>; +def imm_to_tile8 : ComplexPattern", []>; +def imm_to_tile16 : ComplexPattern", []>; +def imm_to_tile32 : ComplexPattern", []>; +def imm_to_tile64 : ComplexPattern", []>; +def imm_to_tile128 : ComplexPattern", []>; + +def imm_to_zt : ComplexPattern", []>; def tileslice8 : ComplexPattern", []>; def tileslice16 : ComplexPattern", []>; @@ -757,6 +759,15 @@ def : InstAlias(NAME) MatrixOp:$ZAt, MatrixIndexGPR32Op12_15:$Rv, sme_elm_idx0_15:$imm4, GPR64sp:$Rn, 0), 1>; + def NAME # _PSEUDO + : Pseudo<(outs), + (ins MatrixIndexGPR32Op12_15:$idx, imm0_15:$imm4, + GPR64sp:$base), []>, + Sched<[]> { + // Translated to actual instruction in AArch64ISelLowering.cpp + let usesCustomInserter = 1; + let mayStore = 1; + } // base def : Pat<(int_aarch64_sme_str MatrixIndexGPR32Op12_15:$idx, GPR64sp:$base), (!cast(NAME) ZA, $idx, 0, $base, 0)>; @@ -1218,8 +1229,9 @@ def : SVE_1_Op_Passthru_Pat(NAME)>; } +// FIXME: Why is there a tied operand here? There is no predication. 
class sve2_clamp sz, bit U, ZPRRegOp zpr_ty> - : I<(outs zpr_ty:$Zd), (ins zpr_ty:$Zn, zpr_ty:$Zm, zpr_ty:$_Zd), + : I<(outs zpr_ty:$Zd), (ins zpr_ty:$_Zd, zpr_ty:$Zn, zpr_ty:$Zm), asm, "\t$Zd, $Zn, $Zm", "", []>, Sched<[]> { bits<5> Zm; @@ -1309,30 +1321,30 @@ (!cast(NAME # _D) PNRAny:$Pd, PNRAny:$Pn, PPR64:$Pm, MatrixIndexGPR32Op12_15:$Rv, sme_elm_idx0_1:$imm), 0>; - def : Pat<(nxv16i1 (op (nxv16i1 PPRAny:$Pn), (nxv16i1 PPRAny:$Pm), + def : Pat<(nxv16i1 (op (nxv16i1 PPRAny:$Pn), (nxv16i1 PPR8:$Pm), MatrixIndexGPR32Op12_15:$idx)), (!cast(NAME # _B) $Pn, $Pm, $idx, 0)>; - def : Pat<(nxv8i1 (op (nxv8i1 PPRAny:$Pn), (nxv8i1 PPRAny:$Pm), + def : Pat<(nxv16i1 (op (nxv16i1 PPRAny:$Pn), (nxv8i1 PPR16:$Pm), MatrixIndexGPR32Op12_15:$idx)), (!cast(NAME # _H) $Pn, $Pm, $idx, 0)>; - def : Pat<(nxv4i1 (op (nxv4i1 PPRAny:$Pn), (nxv4i1 PPRAny:$Pm), + def : Pat<(nxv16i1 (op (nxv16i1 PPRAny:$Pn), (nxv4i1 PPR32:$Pm), MatrixIndexGPR32Op12_15:$idx)), (!cast(NAME # _S) $Pn, $Pm, $idx, 0)>; - def : Pat<(nxv2i1 (op (nxv2i1 PPRAny:$Pn), (nxv2i1 PPRAny:$Pm), + def : Pat<(nxv16i1 (op (nxv16i1 PPRAny:$Pn), (nxv2i1 PPR64:$Pm), MatrixIndexGPR32Op12_15:$idx)), (!cast(NAME # _D) $Pn, $Pm, $idx, 0)>; let AddedComplexity = 1 in { - def : Pat<(nxv16i1 (op (nxv16i1 PPRAny:$Pn), (nxv16i1 PPRAny:$Pm), + def : Pat<(nxv16i1 (op (nxv16i1 PPRAny:$Pn), (nxv16i1 PPR8:$Pm), (i32 (tileslice8 MatrixIndexGPR32Op12_15:$idx, sme_elm_idx0_15:$imm)))), (!cast(NAME # _B) $Pn, $Pm, $idx, $imm)>; - def : Pat<(nxv8i1 (op (nxv8i1 PPRAny:$Pn), (nxv8i1 PPRAny:$Pm), + def : Pat<(nxv16i1 (op (nxv16i1 PPRAny:$Pn), (nxv8i1 PPR16:$Pm), (i32 (tileslice16 MatrixIndexGPR32Op12_15:$idx, sme_elm_idx0_7:$imm)))), (!cast(NAME # _H) $Pn, $Pm, $idx, $imm)>; - def : Pat<(nxv4i1 (op (nxv4i1 PPRAny:$Pn), (nxv4i1 PPRAny:$Pm), + def : Pat<(nxv16i1 (op (nxv16i1 PPRAny:$Pn), (nxv4i1 PPR32:$Pm), (i32 (tileslice32 MatrixIndexGPR32Op12_15:$idx, sme_elm_idx0_3:$imm)))), (!cast(NAME # _S) $Pn, $Pm, $idx, $imm)>; - def : Pat<(nxv2i1 (op (nxv2i1 PPRAny:$Pn), (nxv2i1 PPRAny:$Pm), + def : Pat<(nxv16i1 (op (nxv16i1 PPRAny:$Pn), (nxv2i1 PPR64:$Pm), (i32 (tileslice64 MatrixIndexGPR32Op12_15:$idx, sme_elm_idx0_1:$imm)))), (!cast(NAME # _D) $Pn, $Pm, $idx, $imm)>; } @@ -2926,6 +2938,18 @@ let Inst{3-0} = opc; } +multiclass sme2_zero_zt opc> { + def NAME : sme2_zero_zt; + + def NAME # _PSEUDO + : Pseudo<(outs), (ins ZTR:$ZT), []>, Sched<[]> { + // Translated to actual instruction in AArch64ISelLowering.cpp + let usesCustomInserter = 1; + } + def : Pat<(int_aarch64_sme_zero_zt (imm_to_zt untyped:$zt)), + (!cast(NAME # _PSEUDO) $zt)>; +} + //===----------------------------------------------------------------------===// // SME2 lookup table load/store class sme2_spill_fill_vector opc> @@ -2945,10 +2969,23 @@ let mayStore = opc{7}; } +multiclass sme2_spill_fill_vector opc, SDPatternOperator op> { + def NAME : sme2_spill_fill_vector; + + def NAME # _PSEUDO + : Pseudo<(outs), (ins MatrixOp:$ZTt, GPR64sp:$base), []>, Sched<[]> { + // Translated to actual instruction in AArch64ISelLowering.cpp + let usesCustomInserter = 1; + } + + def : Pat<(op (imm_to_zt untyped:$tile), GPR64sp:$base), + (!cast(NAME # _PSEUDO) $tile, $base)>; +} + //===----------------------------------------------------------------------===/// // SME2 move to/from lookup table class sme2_movt_zt_to_scalar opc> - : I<(outs GPR64:$Rt), (ins ZTR:$ZTt, uimm3s8:$imm3), + : I<(outs GPR64:$Rt), (ins ZTR:$ZTt, tuimm3s8_32:$imm3), mnemonic, "\t$Rt, $ZTt$imm3", "", []>, Sched<[]> { bits<3> imm3; @@ -2960,7 +2997,7 @@ } class 
sme2_movt_scalar_to_zt opc> - : I<(outs ZTR:$ZTt), (ins uimm3s8:$imm3, GPR64:$Rt), + : I<(outs ZTR:$ZTt), (ins tuimm3s8_32:$imm3, GPR64:$Rt), mnemonic, "\t$ZTt$imm3, $Rt", "", []>, Sched<[]> { bits<3> imm3; @@ -2991,28 +3028,46 @@ class sme2_luti2_vector_index sz, RegisterOperand vector_ty, string mnemonic> - : sme2_luti_vector_index { + : sme2_luti_vector_index { bits<4> i; let Inst{17-14} = i; } -multiclass sme2_luti2_vector_index { +multiclass sme2_luti2_vector_index { def _B : sme2_luti2_vector_index<0b00, ZPR8, mnemonic>; def _H : sme2_luti2_vector_index<0b01, ZPR16, mnemonic>; def _S : sme2_luti2_vector_index<0b10, ZPR32, mnemonic>; + + def : Pat<(nxv16i8 (intrinsic (imm_to_zt untyped:$zt), nxv16i8:$zn, (i32 VectorIndexB32b_timm:$imm))), + (!cast(NAME # _B) $zt, nxv16i8:$zn, (i32 VectorIndexB32b_timm:$imm))>; + + def : Pat<(nxv8i16 (intrinsic (imm_to_zt untyped:$zt), nxv8i16:$zn, (i32 VectorIndexB32b_timm:$imm))), + (!cast(NAME # _H) $zt, nxv8i16:$zn, (i32 VectorIndexB32b_timm:$imm))>; + + def : Pat<(nxv4i32 (intrinsic (imm_to_zt untyped:$zt), nxv4i32:$zn, (i32 VectorIndexB32b_timm:$imm))), + (!cast(NAME # _S) $zt, nxv4i32:$zn, (i32 VectorIndexB32b_timm:$imm))>; } class sme2_luti4_vector_index sz, RegisterOperand vector_ty, string mnemonic> - : sme2_luti_vector_index { + : sme2_luti_vector_index { bits<3> i; let Inst{16-14} = i; } -multiclass sme2_luti4_vector_index { +multiclass sme2_luti4_vector_index { def _B : sme2_luti4_vector_index<0b00, ZPR8, mnemonic>; def _H : sme2_luti4_vector_index<0b01, ZPR16, mnemonic>; def _S : sme2_luti4_vector_index<0b10, ZPR32, mnemonic>; + + def : Pat<(nxv16i8 (intrinsic (imm_to_zt untyped:$zt), nxv16i8:$zn, (i32 VectorIndexH32b_timm:$imm))), + (!cast(NAME # _B) $zt, nxv16i8:$zn, (i32 VectorIndexH32b_timm:$imm))>; + + def : Pat<(nxv8i16 (intrinsic (imm_to_zt untyped:$zt), nxv8i16:$zn, (i32 VectorIndexH32b_timm:$imm))), + (!cast(NAME # _H) $zt, nxv8i16:$zn, (i32 VectorIndexH32b_timm:$imm))>; + + def : Pat<(nxv4i32 (intrinsic (imm_to_zt untyped:$zt), nxv4i32:$zn, (i32 VectorIndexH32b_timm:$imm))), + (!cast(NAME # _S) $zt, nxv4i32:$zn, (i32 VectorIndexH32b_timm:$imm))>; } // SME2 lookup table expand two contiguous registers diff --git a/llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp b/llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp new file mode 100644 --- /dev/null +++ b/llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp @@ -0,0 +1,181 @@ +//===- SMEPeepholeOpt.cpp - SME peephole optimization pass ---===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+
+#include "AArch64InstrInfo.h"
+#include "AArch64Subtarget.h"
+#include "Utils/AArch64SMEAttributes.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-sme-peephole-opt"
+
+namespace {
+
+struct SMEPeepholeOpt : public MachineFunctionPass {
+  static char ID;
+
+  SMEPeepholeOpt() : MachineFunctionPass(ID) {
+    initializeSMEPeepholeOptPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  StringRef getPassName() const override {
+    return "SME Peephole Optimization pass";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  bool optimizeStartStopPairs(MachineBasicBlock &MBB) const;
+};
+
+char SMEPeepholeOpt::ID = 0;
+
+} // end anonymous namespace
+
+static bool isConditionalStartStop(const MachineInstr *MI) {
+  return MI->getOpcode() == AArch64::MSRpstatePseudo;
+}
+
+static bool isMatchingStartStopPair(const MachineInstr *MI1,
+                                    const MachineInstr *MI2) {
+  // We only consider the same type of streaming mode change here, i.e.
+  // start/stop SM, or start/stop ZA pairs.
+  if (MI1->getOperand(0).getImm() != MI2->getOperand(0).getImm())
+    return false;
+
+  bool IsConditional = isConditionalStartStop(MI2);
+  if (isConditionalStartStop(MI1) != IsConditional)
+    return false;
+
+  if (!IsConditional)
+    return true;
+
+  // This optimisation is unlikely to happen in practice for conditional
+  // smstart/smstop pairs as the virtual registers for pstate.sm will always
+  // be different.
+  // TODO: For this optimisation to apply to conditional smstart/smstop,
+  // this pass will need to do more work to remove redundant calls to
+  // __arm_sme_state.
+
+  // Only consider conditional start/stop pairs which read the same register
+  // holding the original value of pstate.sm, as some conditional start/stops
+  // require the state on entry to the function.
+  Register Reg1 = MI1->getOperand(2).getReg();
+  Register Reg2 = MI2->getOperand(2).getReg();
+  if (Reg1.isPhysical() || Reg2.isPhysical() || Reg1 != Reg2)
+    return false;
+
+  // Ensure reg masks are identical.
+  if (MI1->getOperand(4).getRegMask() != MI2->getOperand(4).getRegMask())
+    return false;
+
+  // Check to make sure the conditional start/stop pairs are identical.
+  return MI1->getOperand(3).getImm() == MI2->getOperand(3).getImm();
+}
+
+bool SMEPeepholeOpt::optimizeStartStopPairs(MachineBasicBlock &MBB) const {
+  SmallVector<MachineInstr *> ToBeRemoved;
+  MachineInstr *Prev = nullptr;
+  bool PrevIsStart = false;
+
+  for (MachineInstr &MI : make_early_inc_range(MBB)) {
+    // Walk through instructions in the block trying to find pairs of smstart
+    // and smstop nodes that cancel each other out. We only permit a limited
+    // set of instructions to appear between them; otherwise we reset our
+    // tracking.
+    switch (MI.getOpcode()) {
+    default:
+      Prev = nullptr;
+      break;
+    case AArch64::BL: {
+      // Permit calls to __arm_sme_state.
+      if (!MI.getOperand(0).isSymbol() ||
+          strcmp(MI.getOperand(0).getSymbolName(), "__arm_sme_state"))
+        Prev = nullptr;
+      break;
+    }
+    case AArch64::COPY: {
+      // Permit copies of 32 and 64-bit registers.
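+      // (Such copies typically move the pstate.sm value returned in W0/X0 by
+      // __arm_sme_state into a virtual register; they do not affect the
+      // streaming mode, so they are safe to skip over here.)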
+      if (!MI.getOperand(1).isReg()) {
+        Prev = nullptr;
+        break;
+      }
+      Register Reg = MI.getOperand(1).getReg();
+      if (!AArch64::GPR32RegClass.contains(Reg) &&
+          !AArch64::GPR64RegClass.contains(Reg))
+        Prev = nullptr;
+      break;
+    }
+    case AArch64::ADJCALLSTACKDOWN:
+    case AArch64::ADJCALLSTACKUP:
+    case AArch64::ANDXri:
+      // We permit these as they don't generate SVE/NEON instructions.
+      break;
+    case AArch64::MSRpstatesvcrImm1:
+    case AArch64::MSRpstatePseudo: {
+      // Pairs of smstart/smstop nodes must either both be unconditional or
+      // both be conditional.
+      if (Prev && !isMatchingStartStopPair(Prev, &MI)) {
+        Prev = nullptr;
+        break;
+      }
+
+      assert((MI.getOperand(1).getImm() < 2) && "Invalid SM state");
+      bool CurIsStart = (MI.getOperand(1).getImm() != 0);
+      if (Prev && CurIsStart != PrevIsStart) {
+        ToBeRemoved.push_back(Prev);
+        ToBeRemoved.push_back(&MI);
+      }
+
+      if (Prev)
+        Prev = nullptr;
+      else {
+        Prev = &MI;
+        PrevIsStart = CurIsStart;
+      }
+      break;
+    }
+    }
+  }
+
+  for (MachineInstr *MI : ToBeRemoved)
+    MI->eraseFromParent();
+
+  return !ToBeRemoved.empty();
+}
+
+INITIALIZE_PASS(SMEPeepholeOpt, "aarch64-sme-peephole-opt",
+                "SME Peephole Optimization", false, false)
+
+bool SMEPeepholeOpt::runOnMachineFunction(MachineFunction &MF) {
+  if (skipFunction(MF.getFunction()))
+    return false;
+
+  if (!MF.getSubtarget<AArch64Subtarget>().hasSME())
+    return false;
+
+  assert(MF.getRegInfo().isSSA() && "Expected to be run on SSA form!");
+
+  bool Changed = false;
+
+  // Even if the function has no SME attributes attached, we still have to
+  // analyze all of its blocks because it may call a streaming function that
+  // requires smstart/smstop pairs.
+  for (MachineBasicBlock &MBB : MF) {
+    Changed |= optimizeStartStopPairs(MBB);
+  }
+
+  return Changed;
+}
+
+FunctionPass *llvm::createSMEPeepholeOptPass() { return new SMEPeepholeOpt(); }
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -506,6 +518,18 @@
   : Pat<(vtd (op vt1:$Op1, (vt2 ImmTy:$Op2))), (inst $Op1, ImmTy:$Op2)>;
+multiclass SVE2p1_Cntp_Pat<ValueType vtd, SDPatternOperator op, ValueType vt1,
+                           Instruction inst> {
+  def : Pat<(vtd (op vt1:$Op1, (i32 2))), (inst $Op1, 0)>;
+  def : Pat<(vtd (op vt1:$Op1, (i32 4))), (inst $Op1, 1)>;
+}
+
+multiclass SVE2p1_While_PN_Pat<ValueType vtd, SDPatternOperator op,
+                               ValueType vt1, Instruction inst> {
+  def : Pat<(vtd (op vt1:$Op1, vt1:$Op2, (i32 2))), (inst $Op1, $Op2, 0)>;
+  def : Pat<(vtd (op vt1:$Op1, vt1:$Op2, (i32 4))), (inst $Op1, $Op2, 1)>;
+}
+
 class SVE_3_Op_Imm_Pat
@@ -2051,6 +2063,13 @@
   let mayRaiseFPException = 1;
 }
+multiclass sve2p1_bf_2op_p_zds<bits<2> sz, bits<4> opc, string asm,
+                               ZPRRegOp zprty, SDPatternOperator ir_intrinsic> {
+  def NAME : sve_fp_2op_p_zds<sz, opc, asm, zprty>;
+  def : Pat <(nxv8bf16 (ir_intrinsic nxv8i1:$pg, nxv8bf16:$Op1, nxv8bf16:$Op2)),
+             (!cast<Instruction>(NAME) $pg, $Op1, $Op2)>;
+}
+
 multiclass sve_fp_2op_p_zds<bits<4> opc, string asm, string Ps,
                             SDPatternOperator op, DestructiveInstTypeEnum flags,
                             string revname="", bit isReverseInstr=0> {
@@ -2386,7 +2405,7 @@
   let mayRaiseFPException = 1;
 }
-multiclass sve2p1_fp_bfmul_by_indexed_elem<string asm> {
+multiclass sve2p1_fp_bfmul_by_indexed_elem<string asm, SDPatternOperator ir_intrinsic> {
   def NAME : sve_fp_fmul_by_indexed_elem<{0, ?}, 0b1, asm, ZPR16, ZPR3b16, VectorIndexH32b> {
     bits<3> Zm;
     bits<3> iop;
@@ -2394,6 +2413,9 @@
     let Inst{20-19} = iop{1-0};
     let Inst{18-16} = Zm;
   }
+
+  def : Pat <(nxv8bf16 (ir_intrinsic nxv8bf16:$Op1, nxv8bf16:$Op2, (i32 VectorIndexH32b_timm:$idx))),
+             (!cast<Instruction>(NAME) $Op1, $Op2, VectorIndexH32b_timm:$idx)>;
 }
 multiclass sve_fp_fmul_by_indexed_elem {
@@ -8723,6 +8745,47 @@
 def :
SVE_3_Op_Pat(NAME)>; } +class sve_bfloat_matmul_longvecl +: sve_bfloat_matmul { + let Inst{23} = 0b1; + let Inst{14} = 0b0; + let Inst{13} = sub; + let Inst{10} = BT; +} + +multiclass sve_bfloat_matmul_longvecl { + def NAME : sve_bfloat_matmul_longvecl; + def : SVE_3_Op_Pat(NAME)>; +} + +class sve_bfloat_matmul_longvecl_idx +: I<(outs ZPR32:$Zda), (ins ZPR32:$_Zda, ZPR16:$Zn, ZPR3b16:$Zm, VectorIndexH32b:$iop), + asm, "\t$Zda, $Zn, $Zm$iop", "", []>, Sched<[]> { + bits<5> Zda; + bits<5> Zn; + bits<3> Zm; + bits<3> iop; + let Inst{31-21} = 0b01100100111; + let Inst{20-19} = iop{2-1}; + let Inst{18-16} = Zm; + let Inst{15-14} = 0b01; + let Inst{13} = sub; + let Inst{12} = 0b0; + let Inst{11} = iop{0}; + let Inst{10} = BT; + let Inst{9-5} = Zn; + let Inst{4-0} = Zda; + + let Constraints = "$Zda = $_Zda"; + let DestructiveInstType = DestructiveOther; + let ElementSize = ElementSizeH; +} + +multiclass sve_bfloat_matmul_longvecl_idx { + def NAME : sve_bfloat_matmul_longvecl_idx; + def : SVE_4_Op_Imm_Pat(NAME)>; +} + class sve_bfloat_convert : I<(outs ZPR16:$Zd), (ins ZPR16:$_Zd, PPR3bAny:$Pg, ZPR32:$Zn), asm, "\t$Zd, $Pg/m, $Zn", "", []>, Sched<[]> { @@ -9158,9 +9221,9 @@ def : SVE_4_Op_Imm_Pat(NAME)>; } -class sve2p1_ptrue_pn sz, PNRP8to15RegOp pnrty> +class sve2p1_ptrue_pn sz, PNRP8to15RegOp pnrty, SDPatternOperator op> : I<(outs pnrty:$PNd), (ins ), mnemonic, "\t$PNd", - "", []>, Sched<[]> { + "", [(set pnrty:$PNd, (op))]>, Sched<[]> { bits<3> PNd; let Inst{31-24} = 0b00100101; let Inst{23-22} = sz; @@ -9172,10 +9235,10 @@ multiclass sve2p1_ptrue_pn { - def _B : sve2p1_ptrue_pn; - def _H : sve2p1_ptrue_pn; - def _S : sve2p1_ptrue_pn; - def _D : sve2p1_ptrue_pn; + def _B : sve2p1_ptrue_pn; + def _H : sve2p1_ptrue_pn; + def _S : sve2p1_ptrue_pn; + def _D : sve2p1_ptrue_pn; } @@ -9200,16 +9263,21 @@ } class sve2p1_pred_as_ctr_to_mask sz, PPRRegOp pprty> - : sve2p1_pred_as_ctr_to_mask_base { + : sve2p1_pred_as_ctr_to_mask_base { bits<2> index; let Inst{9-8} = index; } -multiclass sve2p1_pred_as_ctr_to_mask { +multiclass sve2p1_pred_as_ctr_to_mask { def _B : sve2p1_pred_as_ctr_to_mask; def _H : sve2p1_pred_as_ctr_to_mask; def _S : sve2p1_pred_as_ctr_to_mask; def _D : sve2p1_pred_as_ctr_to_mask; + + def : SVE_2_Op_Imm_Pat(NAME # _B)>; + def : SVE_2_Op_Imm_Pat(NAME # _H)>; + def : SVE_2_Op_Imm_Pat(NAME # _S)>; + def : SVE_2_Op_Imm_Pat(NAME # _D)>; } @@ -9543,6 +9611,11 @@ def _H : sve2p1_pcount_pn; def _S : sve2p1_pcount_pn; def _D : sve2p1_pcount_pn; + + defm : SVE2p1_Cntp_Pat(NAME # _B)>; + defm : SVE2p1_Cntp_Pat(NAME # _H)>; + defm : SVE2p1_Cntp_Pat(NAME # _S)>; + defm : SVE2p1_Cntp_Pat(NAME # _D)>; } @@ -9579,6 +9652,15 @@ def _H : sve2p1_int_while_rr_pn; def _S : sve2p1_int_while_rr_pn; def _D : sve2p1_int_while_rr_pn; + + defm : SVE2p1_While_PN_Pat("int_aarch64_sve_" # mnemonic # "_c8"), + i64, !cast(NAME # _B)>; + defm : SVE2p1_While_PN_Pat("int_aarch64_sve_" # mnemonic # "_c16"), + i64, !cast(NAME # _H)>; + defm : SVE2p1_While_PN_Pat("int_aarch64_sve_" # mnemonic # "_c32"), + i64, !cast(NAME # _S)>; + defm : SVE2p1_While_PN_Pat("int_aarch64_sve_" # mnemonic # "_c64"), + i64, !cast(NAME # _D)>; } diff --git a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h --- a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h +++ b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h @@ -23,6 +23,8 @@ /// a call). class SMEAttrs { unsigned Bitmask; + bool HasSME2; + bool NoReturn; public: // Enum with bitmasks for each individual SME feature. 
@@ -37,13 +39,23 @@ All = ZA_Preserved - 1 }; - SMEAttrs(unsigned Mask = Normal) : Bitmask(0) { set(Mask); } - SMEAttrs(const Function &F) : SMEAttrs(F.getAttributes()) {} + SMEAttrs(unsigned Mask = Normal, bool SME2 = false) + : Bitmask(0), HasSME2(false), NoReturn(false) { + set(Mask); + HasSME2 = SME2; + NoReturn = false; + } + SMEAttrs(const Function &F, bool SME2 = false) : SMEAttrs(F.getAttributes()) { + HasSME2 = SME2; + NoReturn = F.doesNotReturn(); + } SMEAttrs(const CallBase &CB); SMEAttrs(const AttributeList &L); void set(unsigned M, bool Enable = true); + bool hasNoReturn() const { return NoReturn; } + // Interfaces to query PSTATE.SM bool hasStreamingBody() const { return Bitmask & SM_Body; } bool hasStreamingInterface() const { return Bitmask & SM_Enabled; } @@ -71,6 +83,15 @@ requiresSMChange(const SMEAttrs &Callee, bool BodyOverridesInterface = false) const; + /// \return true if a call from Caller -> Callee requires ZT0 state to be + /// preserved. + /// ZT0 must be preserved if the caller has ZT state (SME2 is enabled and + /// the caller has ZA state) and the callee does not preserve ZT (SME2 is + /// enabled and the callee does not preserve ZA). + bool requiresPreservingZT(const SMEAttrs &Callee) const { + return HasSME2 && hasZTState() && !Callee.preservesZA(); + } + // Interfaces to query PSTATE.ZA bool hasNewZAInterface() const { return Bitmask & ZA_New; } bool hasSharedZAInterface() const { return Bitmask & ZA_Shared; } @@ -81,8 +102,10 @@ } bool requiresLazySave(const SMEAttrs &Callee) const { return hasZAState() && Callee.hasPrivateZAInterface() && - !Callee.preservesZA(); + !Callee.preservesZA() && !Callee.hasNoReturn(); } + + bool hasZTState() const { return hasZAState(); } }; } // namespace llvm diff --git a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp --- a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp +++ b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp @@ -28,12 +28,16 @@ SMEAttrs::SMEAttrs(const CallBase &CB) { *this = SMEAttrs(CB.getAttributes()); - if (auto *F = CB.getCalledFunction()) - set(SMEAttrs(*F).Bitmask); + if (auto *F = CB.getCalledFunction()) { + SMEAttrs A(*F); + set(A.Bitmask); + NoReturn |= A.NoReturn; + } } SMEAttrs::SMEAttrs(const AttributeList &Attrs) { Bitmask = 0; + NoReturn = false; if (Attrs.hasFnAttr("aarch64_pstate_sm_enabled")) Bitmask |= SM_Enabled; if (Attrs.hasFnAttr("aarch64_pstate_sm_compatible")) @@ -46,6 +50,8 @@ Bitmask |= ZA_New; if (Attrs.hasFnAttr("aarch64_pstate_za_preserved")) Bitmask |= ZA_Preserved; + if (Attrs.hasFnAttr(Attribute::NoReturn)) + NoReturn = true; } std::optional diff --git a/llvm/test/CodeGen/AArch64/O3-pipeline.ll b/llvm/test/CodeGen/AArch64/O3-pipeline.ll --- a/llvm/test/CodeGen/AArch64/O3-pipeline.ll +++ b/llvm/test/CodeGen/AArch64/O3-pipeline.ll @@ -117,6 +117,7 @@ ; CHECK-NEXT: MachineDominator Tree Construction ; CHECK-NEXT: AArch64 Local Dynamic TLS Access Clean-up ; CHECK-NEXT: Finalize ISel and expand pseudo-instructions +; CHECK-NEXT: SME Peephole Optimization pass ; CHECK-NEXT: Lazy Machine Block Frequency Analysis ; CHECK-NEXT: Early Tail Duplication ; CHECK-NEXT: Optimize machine instruction PHIs diff --git a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll --- a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll +++ b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll @@ -236,6 +236,8 @@ ; CHECK-COMMON-NEXT: msub x8, x8, x8, x9 ; 
CHECK-COMMON-NEXT: mov sp, x8 ; CHECK-COMMON-NEXT: stur x8, [x29, #-16] +; CHECK-COMMON-NEXT: sturh wzr, [x29, #-6] +; CHECK-COMMON-NEXT: stur wzr, [x29, #-4] ; CHECK-COMMON-NEXT: mrs x8, TPIDR2_EL0 ; CHECK-COMMON-NEXT: cbz x8, .LBB6_2 ; CHECK-COMMON-NEXT: b .LBB6_1 @@ -266,11 +268,12 @@ ; CHECK-COMMON-NEXT: mov x29, sp ; CHECK-COMMON-NEXT: sub sp, sp, #16 ; CHECK-COMMON-NEXT: rdsvl x8, #1 -; CHECK-COMMON-NEXT: mul x8, x8, x8 ; CHECK-COMMON-NEXT: mov x9, sp -; CHECK-COMMON-NEXT: subs x9, x9, x8 +; CHECK-COMMON-NEXT: msub x9, x8, x8, x9 ; CHECK-COMMON-NEXT: mov sp, x9 ; CHECK-COMMON-NEXT: stur x9, [x29, #-16] +; CHECK-COMMON-NEXT: sturh wzr, [x29, #-6] +; CHECK-COMMON-NEXT: stur wzr, [x29, #-4] ; CHECK-COMMON-NEXT: sturh w8, [x29, #-8] ; CHECK-COMMON-NEXT: sub x8, x29, #16 ; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x8 @@ -304,14 +307,15 @@ ; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill ; CHECK-COMMON-NEXT: mov x29, sp ; CHECK-COMMON-NEXT: sub sp, sp, #16 -; CHECK-COMMON-NEXT: rdsvl x8, #1 -; CHECK-COMMON-NEXT: mov x9, sp -; CHECK-COMMON-NEXT: mul x8, x8, x8 -; CHECK-COMMON-NEXT: sub x9, x9, x8 -; CHECK-COMMON-NEXT: mov sp, x9 +; CHECK-COMMON-NEXT: mov x8, sp +; CHECK-COMMON-NEXT: rdsvl x9, #1 +; CHECK-COMMON-NEXT: msub x8, x9, x9, x8 +; CHECK-COMMON-NEXT: mov sp, x8 ; CHECK-COMMON-NEXT: sub x10, x29, #16 -; CHECK-COMMON-NEXT: stur x9, [x29, #-16] -; CHECK-COMMON-NEXT: sturh w8, [x29, #-8] +; CHECK-COMMON-NEXT: stur wzr, [x29, #-4] +; CHECK-COMMON-NEXT: sturh wzr, [x29, #-6] +; CHECK-COMMON-NEXT: stur x8, [x29, #-16] +; CHECK-COMMON-NEXT: sturh w9, [x29, #-8] ; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x10 ; CHECK-COMMON-NEXT: bl __addtf3 ; CHECK-COMMON-NEXT: smstart za diff --git a/llvm/test/CodeGen/AArch64/sme-exceptions-with-streaming-mode-llc.ll b/llvm/test/CodeGen/AArch64/sme-exceptions-with-streaming-mode-llc.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme-exceptions-with-streaming-mode-llc.ll @@ -0,0 +1,123 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --include-generated-funcs +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -verify-machineinstrs -o - %S/sme-exceptions-with-streaming-mode.ll | FileCheck %s + +; CHECK-LABEL: no_za_streaming_enabled: +; CHECK: .Lfunc_begin0: +; CHECK-NEXT: .cfi_startproc +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: stp d15, d14, [sp, #-80]! 
// 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 80 +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: .cfi_offset w30, -16 +; CHECK-NEXT: .cfi_offset b8, -24 +; CHECK-NEXT: .cfi_offset b9, -32 +; CHECK-NEXT: .cfi_offset b10, -40 +; CHECK-NEXT: .cfi_offset b11, -48 +; CHECK-NEXT: .cfi_offset b12, -56 +; CHECK-NEXT: .cfi_offset b13, -64 +; CHECK-NEXT: .cfi_offset b14, -72 +; CHECK-NEXT: .cfi_offset b15, -80 +; CHECK-NEXT: .Ltmp0: +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl normal_callee +; CHECK-NEXT: smstart sm +; CHECK-NEXT: .Ltmp1: +; CHECK-NEXT: // %bb.1: +; CHECK-NEXT: mov w0, #15 +; CHECK-NEXT: .LBB0_2: // %return +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB0_3: // %lpad +; CHECK-NEXT: .Ltmp2: +; CHECK-NEXT: bl __cxa_begin_catch +; CHECK-NEXT: bl __cxa_end_catch +; CHECK-NEXT: smstart sm +; CHECK-NEXT: mov w0, #23 +; CHECK-NEXT: b .LBB0_2 +; +; CHECK-LABEL: no_za_streaming_compatible: +; CHECK: .Lfunc_begin1: +; CHECK-NEXT: .cfi_startproc +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 96 +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_offset w19, -8 +; CHECK-NEXT: .cfi_offset w20, -16 +; CHECK-NEXT: .cfi_offset w30, -32 +; CHECK-NEXT: .cfi_offset b8, -40 +; CHECK-NEXT: .cfi_offset b9, -48 +; CHECK-NEXT: .cfi_offset b10, -56 +; CHECK-NEXT: .cfi_offset b11, -64 +; CHECK-NEXT: .cfi_offset b12, -72 +; CHECK-NEXT: .cfi_offset b13, -80 +; CHECK-NEXT: .cfi_offset b14, -88 +; CHECK-NEXT: .cfi_offset b15, -96 +; CHECK-NEXT: bl __arm_sme_state +; CHECK-NEXT: and x19, x0, #0x1 +; CHECK-NEXT: .Ltmp3: +; CHECK-NEXT: bl __arm_sme_state +; CHECK-NEXT: and x20, x0, #0x1 +; CHECK-NEXT: tbz x20, #0, .LBB1_2 +; CHECK-NEXT: // %bb.1: // %entry +; CHECK-NEXT: smstop sm +; CHECK-NEXT: .LBB1_2: // %entry +; CHECK-NEXT: bl normal_callee +; CHECK-NEXT: tbz x20, #0, .LBB1_4 +; CHECK-NEXT: // %bb.3: // %entry +; CHECK-NEXT: smstart sm +; CHECK-NEXT: .LBB1_4: // %entry +; CHECK-NEXT: .Ltmp4: +; CHECK-NEXT: // %bb.5: +; CHECK-NEXT: mov w0, #15 +; CHECK-NEXT: .LBB1_6: // %return +; CHECK-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB1_7: // %lpad +; CHECK-NEXT: .Ltmp5: +; CHECK-NEXT: mov x2, x0 +; CHECK-NEXT: tbz x19, #0, .LBB1_9 +; CHECK-NEXT: // %bb.8: // %lpad +; CHECK-NEXT: smstart sm +; CHECK-NEXT: .LBB1_9: // %lpad +; CHECK-NEXT: bl __arm_sme_state +; CHECK-NEXT: and x19, x0, #0x1 +; 
CHECK-NEXT: tbz x19, #0, .LBB1_11 +; CHECK-NEXT: // %bb.10: // %lpad +; CHECK-NEXT: smstop sm +; CHECK-NEXT: .LBB1_11: // %lpad +; CHECK-NEXT: mov x0, x2 +; CHECK-NEXT: bl __cxa_begin_catch +; CHECK-NEXT: tbz x19, #0, .LBB1_13 +; CHECK-NEXT: // %bb.12: // %lpad +; CHECK-NEXT: smstart sm +; CHECK-NEXT: .LBB1_13: // %lpad +; CHECK-NEXT: bl __arm_sme_state +; CHECK-NEXT: and x19, x0, #0x1 +; CHECK-NEXT: tbz x19, #0, .LBB1_15 +; CHECK-NEXT: // %bb.14: // %lpad +; CHECK-NEXT: smstop sm +; CHECK-NEXT: .LBB1_15: // %lpad +; CHECK-NEXT: bl __cxa_end_catch +; CHECK-NEXT: tbz x19, #0, .LBB1_17 +; CHECK-NEXT: // %bb.16: // %lpad +; CHECK-NEXT: smstart sm +; CHECK-NEXT: .LBB1_17: // %lpad +; CHECK-NEXT: mov w0, #23 +; CHECK-NEXT: b .LBB1_6 diff --git a/llvm/test/CodeGen/AArch64/sme-exceptions-with-streaming-mode.ll b/llvm/test/CodeGen/AArch64/sme-exceptions-with-streaming-mode.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme-exceptions-with-streaming-mode.ll @@ -0,0 +1,88 @@ +; RUN: opt -S -mtriple=aarch64-linux-gnu -aarch64-sme-abi %s | FileCheck %s +; RUN: opt -S -mtriple=aarch64-linux-gnu -aarch64-sme-abi -aarch64-sme-abi %s | FileCheck %s + +declare i32 @normal_callee() + +; Simple try-catch with no ZA state. No lazy-save is required, but we must restart pstate.sm in the +; exception handler. + +define i32 @no_za_streaming_enabled() "aarch64_pstate_sm_enabled" personality i32 1 { +; CHECK-LABEL: define {{[^@]+}}@no_za_streaming_enabled() #0 personality i32 1 { +; CHECK-NEXT: entry: +; CHECK-NEXT: invoke void @normal_callee() +; CHECK-NEXT: to label [[RETURN:%.*]] unwind label [[LPAD:%.*]] +; CHECK: lpad: +; CHECK-NEXT: [[TMP0:%.*]] = landingpad { ptr, i32 } +; CHECK-NEXT: catch ptr null +; CHECK-NEXT: call void @llvm.aarch64.sme.invoke.resume.pstatesm(i64 1) +; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { ptr, i32 } [[TMP0]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = tail call ptr @__cxa_begin_catch(ptr [[TMP1]]) +; CHECK-NEXT: tail call void @__cxa_end_catch() +; CHECK-NEXT: br label [[RETURN]] +; CHECK: return: +; CHECK-NEXT: [[RETVAL:%.*]] = phi i32 [ 23, [[LPAD]] ], [ 15, [[ENTRY:%.*]] ] +; CHECK-NEXT: ret i32 [[RETVAL]] +; +entry: + invoke void @normal_callee() to label %return unwind label %lpad + +lpad: + %0 = landingpad { ptr, i32 } + catch ptr null + %1 = extractvalue { ptr, i32 } %0, 0 + %2 = tail call ptr @__cxa_begin_catch(ptr %1) + tail call void @__cxa_end_catch() + br label %return + +return: + %retval = phi i32 [ 23, %lpad ], [ 15, %entry ] + ret i32 %retval +} + +; As above, but the function is streaming_compatible + +define i32 @no_za_streaming_compatible() "aarch64_pstate_sm_compatible" personality i32 1 { +; CHECK-LABEL: define {{[^@]+}}@no_za_streaming_compatible() #1 personality i32 1 { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call aarch64_sme_preservemost_from_x2 [[TMP0]] @__arm_sme_state() +; CHECK-NEXT: [[TMP1:%.*]] = extractvalue [[TMP0]] [[TMP0]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = and i64 [[TMP1]], 1 +; CHECK-NEXT: invoke void @normal_callee() +; CHECK-NEXT: to label [[RETURN:%.*]] unwind label [[LPAD:%.*]] +; CHECK: lpad: +; CHECK-NEXT: [[TMP3:%.*]] = landingpad { ptr, i32 } +; CHECK-NEXT: catch ptr null +; CHECK-NEXT: call void @llvm.aarch64.sme.invoke.resume.pstatesm(i64 [[TMP2]]) +; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { ptr, i32 } [[TMP3]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = tail call ptr @__cxa_begin_catch(ptr [[TMP4]]) +; CHECK-NEXT: tail call void @__cxa_end_catch() +; CHECK-NEXT: br label [[RETURN]] +; CHECK: return: +; CHECK-NEXT: 
[[RETVAL:%.*]] = phi i32 [ 23, [[LPAD]] ], [ 15, [[ENTRY:%.*]] ] +; CHECK-NEXT: ret i32 [[RETVAL]] +; +entry: + invoke void @normal_callee() to label %return unwind label %lpad + +lpad: + %0 = landingpad { ptr, i32 } + catch ptr null + %1 = extractvalue { ptr, i32 } %0, 0 + %2 = tail call ptr @__cxa_begin_catch(ptr %1) + tail call void @__cxa_end_catch() + br label %return + +return: + %retval = phi i32 [ 23, %lpad ], [ 15, %entry ] + ret i32 %retval +} + +declare ptr @__cxa_begin_catch(ptr) +declare void @__cxa_end_catch() + +; CHECK: declare %0 @__arm_sme_state() #3 + +; CHECK: attributes #0 = { "aarch64_expanded_pstate_za" "aarch64_pstate_sm_enabled" } +; CHECK: attributes #1 = { "aarch64_expanded_pstate_za" "aarch64_pstate_sm_compatible" } +; CHECK: attributes #2 = { nocallback nofree nosync nounwind willreturn } +; CHECK: attributes #3 = { "aarch64_pstate_sm_compatible" "aarch64_pstate_za_preserved" } diff --git a/llvm/test/CodeGen/AArch64/sme-exceptions-with-za-state-llc.ll b/llvm/test/CodeGen/AArch64/sme-exceptions-with-za-state-llc.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme-exceptions-with-za-state-llc.ll @@ -0,0 +1,1392 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --include-generated-funcs +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -verify-machineinstrs -o - %S/sme-exceptions-with-za-state.ll | FileCheck %s + +; CHECK-LABEL: private_za_invoke: +; CHECK: .Lfunc_begin0: +; CHECK-NEXT: .cfi_startproc +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 80 +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: add x29, sp, #64 +; CHECK-NEXT: .cfi_def_cfa w29, 16 +; CHECK-NEXT: .cfi_offset w30, -8 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: .cfi_offset b8, -24 +; CHECK-NEXT: .cfi_offset b9, -32 +; CHECK-NEXT: .cfi_offset b10, -40 +; CHECK-NEXT: .cfi_offset b11, -48 +; CHECK-NEXT: .cfi_offset b12, -56 +; CHECK-NEXT: .cfi_offset b13, -64 +; CHECK-NEXT: .cfi_offset b14, -72 +; CHECK-NEXT: .cfi_offset b15, -80 +; CHECK-NEXT: sub sp, sp, #48 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: mov x10, sp +; CHECK-NEXT: mul x9, x8, x8 +; CHECK-NEXT: sub x10, x10, x9 +; CHECK-NEXT: mov sp, x10 +; CHECK-NEXT: add x9, x9, #15 +; CHECK-NEXT: stur x10, [x29, #-112] +; CHECK-NEXT: mov x10, sp +; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 +; CHECK-NEXT: sub x9, x10, x9 +; CHECK-NEXT: stur wzr, [x29, #-100] +; CHECK-NEXT: sturh wzr, [x29, #-102] +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: sub x10, x29, #96 +; CHECK-NEXT: stur x9, [x29, #-80] +; CHECK-NEXT: sturh w8, [x29, #-72] +; CHECK-NEXT: stur x10, [x29, #-70] +; CHECK-NEXT: smstart za +; CHECK-NEXT: sub x8, x29, #80 +; CHECK-NEXT: msr TPIDR2_EL0, x8 +; CHECK-NEXT: bl __arm_tpidr2_save +; CHECK-NEXT: .Ltmp0: +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl normal_callee +; CHECK-NEXT: smstart sm +; CHECK-NEXT: .Ltmp1: +; CHECK-NEXT: // %bb.1: // %check.za +; CHECK-NEXT: smstart za +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: cbnz x8, .LBB0_3 +; CHECK-NEXT: // %bb.2: // %restore.za +; CHECK-NEXT: sub x0, x29, #80 +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB0_3: // %return +; CHECK-NEXT: mov w0, #15 +; CHECK-NEXT: .LBB0_4: // %return +; CHECK-NEXT: msr TPIDR2_EL0, xzr 
+; CHECK-NEXT: sub sp, x29, #64 +; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB0_5: // %lpad +; CHECK-NEXT: .Ltmp2: +; CHECK-NEXT: bl __cxa_begin_catch +; CHECK-NEXT: smstart sm +; CHECK-NEXT: smstart za +; CHECK-NEXT: sub x0, x29, #80 +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: sub x8, x29, #112 +; CHECK-NEXT: rdsvl x9, #1 +; CHECK-NEXT: sturh w9, [x29, #-104] +; CHECK-NEXT: msr TPIDR2_EL0, x8 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl __cxa_end_catch +; CHECK-NEXT: smstart sm +; CHECK-NEXT: smstart za +; CHECK-NEXT: sub x0, x29, #112 +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: cbnz x8, .LBB0_7 +; CHECK-NEXT: // %bb.6: // %lpad +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB0_7: // %lpad +; CHECK-NEXT: mov w0, #23 +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: b .LBB0_4 +; +; CHECK-LABEL: new_za_invoke: +; CHECK: .Lfunc_begin1: +; CHECK-NEXT: .cfi_startproc +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 96 +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: add x29, sp, #64 +; CHECK-NEXT: .cfi_def_cfa w29, 32 +; CHECK-NEXT: .cfi_offset w19, -8 +; CHECK-NEXT: .cfi_offset w20, -16 +; CHECK-NEXT: .cfi_offset w30, -24 +; CHECK-NEXT: .cfi_offset w29, -32 +; CHECK-NEXT: .cfi_offset b8, -40 +; CHECK-NEXT: .cfi_offset b9, -48 +; CHECK-NEXT: .cfi_offset b10, -56 +; CHECK-NEXT: .cfi_offset b11, -64 +; CHECK-NEXT: .cfi_offset b12, -72 +; CHECK-NEXT: .cfi_offset b13, -80 +; CHECK-NEXT: .cfi_offset b14, -88 +; CHECK-NEXT: .cfi_offset b15, -96 +; CHECK-NEXT: sub sp, sp, #48 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: mov x10, sp +; CHECK-NEXT: mul x9, x8, x8 +; CHECK-NEXT: sub x10, x10, x9 +; CHECK-NEXT: mov sp, x10 +; CHECK-NEXT: stur wzr, [x29, #-100] +; CHECK-NEXT: sturh wzr, [x29, #-102] +; CHECK-NEXT: stur x10, [x29, #-112] +; CHECK-NEXT: bl __arm_sme_state +; CHECK-NEXT: add x9, x9, #15 +; CHECK-NEXT: mov x10, sp +; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 +; CHECK-NEXT: and x19, x0, #0x1 +; CHECK-NEXT: sub x9, x10, x9 +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: sub x10, x29, #96 +; CHECK-NEXT: stur x9, [x29, #-80] +; CHECK-NEXT: sturh w8, [x29, #-72] +; CHECK-NEXT: stur x10, [x29, #-70] +; CHECK-NEXT: smstart za +; CHECK-NEXT: sub x8, x29, #80 +; CHECK-NEXT: msr TPIDR2_EL0, x8 +; CHECK-NEXT: .Ltmp3: +; CHECK-NEXT: bl __arm_sme_state +; CHECK-NEXT: and x20, x0, #0x1 +; CHECK-NEXT: tbz x20, #0, .LBB1_2 +; CHECK-NEXT: // %bb.1: // %entry +; CHECK-NEXT: smstop sm +; CHECK-NEXT: .LBB1_2: // %entry +; CHECK-NEXT: bl snap_new +; CHECK-NEXT: tbz x20, #0, .LBB1_4 +; CHECK-NEXT: // %bb.3: // %entry +; CHECK-NEXT: smstart sm +; CHECK-NEXT: .LBB1_4: // %entry +; CHECK-NEXT: .Ltmp4: +; CHECK-NEXT: // %bb.5: // %check.za +; CHECK-NEXT: smstart za +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: cbnz x8, .LBB1_7 +; CHECK-NEXT: // %bb.6: // %restore.za +; CHECK-NEXT: sub x0, x29, #80 +; CHECK-NEXT: bl __arm_tpidr2_restore 
+; CHECK-NEXT: .LBB1_7: // %return +; CHECK-NEXT: mov w0, #15 +; CHECK-NEXT: .LBB1_8: // %return +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: sub sp, x29, #64 +; CHECK-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB1_9: // %lpad +; CHECK-NEXT: .Ltmp5: +; CHECK-NEXT: mov x2, x0 +; CHECK-NEXT: tbz x19, #0, .LBB1_11 +; CHECK-NEXT: // %bb.10: // %lpad +; CHECK-NEXT: smstart sm +; CHECK-NEXT: .LBB1_11: // %lpad +; CHECK-NEXT: bl __arm_sme_state +; CHECK-NEXT: and x19, x0, #0x1 +; CHECK-NEXT: tbz x19, #0, .LBB1_13 +; CHECK-NEXT: // %bb.12: // %lpad +; CHECK-NEXT: smstop sm +; CHECK-NEXT: .LBB1_13: // %lpad +; CHECK-NEXT: mov x0, x2 +; CHECK-NEXT: bl __cxa_begin_catch +; CHECK-NEXT: tbz x19, #0, .LBB1_15 +; CHECK-NEXT: // %bb.14: // %lpad +; CHECK-NEXT: smstart sm +; CHECK-NEXT: .LBB1_15: // %lpad +; CHECK-NEXT: smstart za +; CHECK-NEXT: sub x0, x29, #80 +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: sub x8, x29, #112 +; CHECK-NEXT: rdsvl x9, #1 +; CHECK-NEXT: sturh w9, [x29, #-104] +; CHECK-NEXT: msr TPIDR2_EL0, x8 +; CHECK-NEXT: bl __arm_sme_state +; CHECK-NEXT: and x19, x0, #0x1 +; CHECK-NEXT: tbz x19, #0, .LBB1_17 +; CHECK-NEXT: // %bb.16: // %lpad +; CHECK-NEXT: smstop sm +; CHECK-NEXT: .LBB1_17: // %lpad +; CHECK-NEXT: bl __cxa_end_catch +; CHECK-NEXT: tbz x19, #0, .LBB1_19 +; CHECK-NEXT: // %bb.18: // %lpad +; CHECK-NEXT: smstart sm +; CHECK-NEXT: .LBB1_19: // %lpad +; CHECK-NEXT: smstart za +; CHECK-NEXT: sub x0, x29, #112 +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: cbnz x8, .LBB1_21 +; CHECK-NEXT: // %bb.20: // %lpad +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB1_21: // %lpad +; CHECK-NEXT: mov w0, #23 +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: b .LBB1_8 +; +; CHECK-LABEL: private_za_in_catch_block: +; CHECK: .Lfunc_begin2: +; CHECK-NEXT: .cfi_startproc +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: .cfi_def_cfa w29, 32 +; CHECK-NEXT: .cfi_offset w19, -8 +; CHECK-NEXT: .cfi_offset w20, -16 +; CHECK-NEXT: .cfi_offset w30, -24 +; CHECK-NEXT: .cfi_offset w29, -32 +; CHECK-NEXT: sub sp, sp, #48 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: mov x10, sp +; CHECK-NEXT: mul x9, x8, x8 +; CHECK-NEXT: sub x10, x10, x9 +; CHECK-NEXT: mov sp, x10 +; CHECK-NEXT: add x9, x9, #15 +; CHECK-NEXT: stur x10, [x29, #-48] +; CHECK-NEXT: mov x10, sp +; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 +; CHECK-NEXT: sub x9, x10, x9 +; CHECK-NEXT: stur wzr, [x29, #-36] +; CHECK-NEXT: sturh wzr, [x29, #-38] +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: sub x10, x29, #32 +; CHECK-NEXT: stur x9, [x29, #-16] +; CHECK-NEXT: sturh w8, [x29, #-8] +; CHECK-NEXT: stur x10, [x29, #-6] +; CHECK-NEXT: smstart za +; CHECK-NEXT: sub x8, x29, #16 +; CHECK-NEXT: msr TPIDR2_EL0, x8 +; CHECK-NEXT: bl __arm_tpidr2_save +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: .Ltmp6: +; CHECK-NEXT: bl foo_shared +; CHECK-NEXT: .Ltmp7: +; CHECK-NEXT: // %bb.1: +; CHECK-NEXT: mov w0, #15 +; CHECK-NEXT: .LBB2_2: // %return +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB2_3: // %lpad1 +; CHECK-NEXT: .Ltmp8: +; CHECK-NEXT: bl __cxa_begin_catch +; CHECK-NEXT: smstart za +; CHECK-NEXT: sub x0, x29, #16 +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: sub x19, x29, #48 +; CHECK-NEXT: rdsvl x20, #1 +; CHECK-NEXT: sturh w20, [x29, #-40] +; CHECK-NEXT: msr TPIDR2_EL0, x19 +; CHECK-NEXT: bl normal_callee +; CHECK-NEXT: smstart za +; CHECK-NEXT: sub x0, x29, #48 +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: cbnz x8, .LBB2_5 +; CHECK-NEXT: // %bb.4: // %lpad1 +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB2_5: // %lpad1 +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: sturh w20, [x29, #-40] +; CHECK-NEXT: msr TPIDR2_EL0, x19 +; CHECK-NEXT: bl __cxa_end_catch +; CHECK-NEXT: smstart za +; CHECK-NEXT: sub x0, x29, #48 +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: cbnz x8, .LBB2_7 +; CHECK-NEXT: // %bb.6: // %lpad1 +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB2_7: // %lpad1 +; CHECK-NEXT: mov w0, #23 +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: b .LBB2_2 +; +; CHECK-LABEL: private_za_in_catch_block_from_new_za: +; CHECK: .Lfunc_begin3: +; CHECK-NEXT: .cfi_startproc +; CHECK-NEXT: // %bb.0: // %prelude +; CHECK-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: .cfi_def_cfa w29, 32 +; CHECK-NEXT: .cfi_offset w19, -8 +; CHECK-NEXT: .cfi_offset w20, -16 +; CHECK-NEXT: .cfi_offset w30, -24 +; CHECK-NEXT: .cfi_offset w29, -32 +; CHECK-NEXT: sub sp, sp, #48 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: mul x8, x8, x8 +; CHECK-NEXT: sub x9, x9, x8 +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: add x8, x8, #15 +; CHECK-NEXT: stur x9, [x29, #-48] +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: and x8, x8, #0xfffffffffffffff0 +; CHECK-NEXT: sub x8, x9, x8 +; CHECK-NEXT: stur wzr, [x29, #-36] +; CHECK-NEXT: sturh wzr, [x29, #-38] +; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: sub x9, x29, #32 +; CHECK-NEXT: stur x8, [x29, #-16] +; CHECK-NEXT: stur x9, [x29, #-6] +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: cbz x8, .LBB3_2 +; CHECK-NEXT: // %bb.1: // %save.za +; CHECK-NEXT: bl __arm_tpidr2_save +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: .LBB3_2: // %entry +; CHECK-NEXT: smstart za +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: sturh w8, [x29, #-8] +; CHECK-NEXT: smstart za +; CHECK-NEXT: sub x8, x29, #16 +; CHECK-NEXT: msr TPIDR2_EL0, x8 +; CHECK-NEXT: bl __arm_tpidr2_save +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: .Ltmp9: +; CHECK-NEXT: bl foo_shared +; CHECK-NEXT: .Ltmp10: +; CHECK-NEXT: // %bb.3: +; CHECK-NEXT: mov w0, #15 +; CHECK-NEXT: .LBB3_4: // %return +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: smstop za +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB3_5: // %lpad1 +; CHECK-NEXT: .Ltmp11: +; CHECK-NEXT: bl __cxa_begin_catch +; CHECK-NEXT: smstart za +; CHECK-NEXT: sub x0, x29, #16 +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: sub x19, x29, #48 +; CHECK-NEXT: rdsvl x20, #1 +; CHECK-NEXT: sturh w20, [x29, #-40] +; CHECK-NEXT: msr TPIDR2_EL0, x19 +; CHECK-NEXT: bl normal_callee +; CHECK-NEXT: smstart za +; CHECK-NEXT: sub x0, x29, #48 +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: cbnz x8, .LBB3_7 +; CHECK-NEXT: // %bb.6: // %lpad1 +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB3_7: // %lpad1 +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: sturh w20, [x29, #-40] +; CHECK-NEXT: msr TPIDR2_EL0, x19 +; CHECK-NEXT: bl __cxa_end_catch +; CHECK-NEXT: smstart za +; CHECK-NEXT: sub x0, x29, #48 +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: cbnz x8, .LBB3_9 +; CHECK-NEXT: // %bb.8: // %lpad1 +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB3_9: // %lpad1 +; CHECK-NEXT: mov w0, #23 +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: b .LBB3_4 +; +; CHECK-LABEL: call_after_catch_block: +; CHECK: .Lfunc_begin4: +; CHECK-NEXT: .cfi_startproc +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: .cfi_def_cfa w29, 32 +; CHECK-NEXT: .cfi_offset w19, -8 +; CHECK-NEXT: .cfi_offset w20, -16 +; CHECK-NEXT: .cfi_offset w30, -24 +; CHECK-NEXT: .cfi_offset w29, -32 +; CHECK-NEXT: sub sp, sp, #48 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: mov x10, sp +; CHECK-NEXT: mul x9, x8, x8 +; CHECK-NEXT: sub x10, x10, x9 +; CHECK-NEXT: mov sp, x10 +; CHECK-NEXT: add x9, x9, #15 +; CHECK-NEXT: stur x10, [x29, #-48] +; CHECK-NEXT: mov x10, sp +; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 +; CHECK-NEXT: sub x9, x10, x9 +; CHECK-NEXT: stur wzr, [x29, #-36] +; CHECK-NEXT: sturh wzr, [x29, #-38] +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: sub x10, x29, #32 +; CHECK-NEXT: stur x9, [x29, #-16] +; CHECK-NEXT: sturh w8, [x29, #-8] +; CHECK-NEXT: stur x10, [x29, #-6] +; CHECK-NEXT: smstart za +; CHECK-NEXT: sub x8, x29, #16 +; CHECK-NEXT: msr TPIDR2_EL0, x8 +; CHECK-NEXT: bl __arm_tpidr2_save +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: .Ltmp12: +; CHECK-NEXT: bl foo_shared +; CHECK-NEXT: .Ltmp13: +; CHECK-NEXT: .LBB4_1: // %try.cont +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: b fizz_shared +; CHECK-NEXT: .LBB4_2: // %lpad1 +; CHECK-NEXT: .Ltmp14: +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: cmp w1, #1 +; CHECK-NEXT: b.ne .LBB4_10 +; CHECK-NEXT: // %bb.3: // %catch +; CHECK-NEXT: mov x0, x19 +; CHECK-NEXT: bl __cxa_begin_catch +; CHECK-NEXT: smstart za +; CHECK-NEXT: sub x0, x29, #16 +; CHECK-NEXT: sub x20, x29, #16 +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: sturh w8, [x29, #-8] +; CHECK-NEXT: smstart za +; CHECK-NEXT: msr TPIDR2_EL0, x20 +; CHECK-NEXT: bl __arm_tpidr2_save +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: .Ltmp15: +; CHECK-NEXT: bl bar_shared +; CHECK-NEXT: .Ltmp16: +; CHECK-NEXT: // %bb.4: // %invoke.cont +; CHECK-NEXT: sub x8, x29, #48 +; CHECK-NEXT: rdsvl x9, #1 +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: sturh w9, [x29, #-40] +; CHECK-NEXT: msr TPIDR2_EL0, x8 +; CHECK-NEXT: bl __cxa_end_catch +; CHECK-NEXT: smstart za +; CHECK-NEXT: sub x0, x29, #48 +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: cbnz x8, .LBB4_6 +; CHECK-NEXT: // %bb.5: // %invoke.cont +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB4_6: // %invoke.cont +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: b .LBB4_1 +; CHECK-NEXT: .LBB4_7: // %lpad2 +; CHECK-NEXT: .Ltmp17: +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: sturh w8, [x29, #-8] +; CHECK-NEXT: smstart za +; CHECK-NEXT: msr TPIDR2_EL0, x20 +; CHECK-NEXT: bl __arm_tpidr2_save +; CHECK-NEXT: .Ltmp18: +; CHECK-NEXT: bl __cxa_end_catch +; CHECK-NEXT: .Ltmp19: +; CHECK-NEXT: // %bb.8: // %check.za +; CHECK-NEXT: smstart za +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: cbnz x8, .LBB4_10 +; CHECK-NEXT: // %bb.9: // %restore.za +; CHECK-NEXT: sub x0, x29, #16 +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB4_10: // %eh.resume +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: smstop za +; CHECK-NEXT: mov x0, x19 +; CHECK-NEXT: bl _Unwind_Resume +; CHECK-NEXT: .LBB4_11: // %terminate.lpad +; CHECK-NEXT: .Ltmp20: +; CHECK-NEXT: bl __clang_call_terminate +; +; CHECK-LABEL: multiple_catch: +; CHECK: .Lfunc_begin5: +; CHECK-NEXT: .cfi_startproc +; CHECK-NEXT: // 
%bb.0: // %entry +; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: .cfi_def_cfa w29, 32 +; CHECK-NEXT: .cfi_offset w19, -8 +; CHECK-NEXT: .cfi_offset w20, -16 +; CHECK-NEXT: .cfi_offset w30, -24 +; CHECK-NEXT: .cfi_offset w29, -32 +; CHECK-NEXT: sub sp, sp, #48 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: mov x10, sp +; CHECK-NEXT: mul x9, x8, x8 +; CHECK-NEXT: sub x10, x10, x9 +; CHECK-NEXT: mov sp, x10 +; CHECK-NEXT: add x9, x9, #15 +; CHECK-NEXT: stur x10, [x29, #-48] +; CHECK-NEXT: mov x10, sp +; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 +; CHECK-NEXT: sub x9, x10, x9 +; CHECK-NEXT: stur wzr, [x29, #-36] +; CHECK-NEXT: sturh wzr, [x29, #-38] +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: sub x10, x29, #32 +; CHECK-NEXT: stur x9, [x29, #-16] +; CHECK-NEXT: sturh w8, [x29, #-8] +; CHECK-NEXT: stur x10, [x29, #-6] +; CHECK-NEXT: smstart za +; CHECK-NEXT: sub x8, x29, #16 +; CHECK-NEXT: msr TPIDR2_EL0, x8 +; CHECK-NEXT: bl __arm_tpidr2_save +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: .Ltmp21: +; CHECK-NEXT: bl foo_shared +; CHECK-NEXT: .Ltmp22: +; CHECK-NEXT: // %bb.1: +; CHECK-NEXT: mov w19, #42 +; CHECK-NEXT: .LBB5_2: // %return +; CHECK-NEXT: mov w0, w19 +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB5_3: // %lpad1 +; CHECK-NEXT: .Ltmp23: +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: cmp w1, #2 +; CHECK-NEXT: b.ne .LBB5_6 +; CHECK-NEXT: // %bb.4: // %catch1 +; CHECK-NEXT: mov x0, x19 +; CHECK-NEXT: bl __cxa_begin_catch +; CHECK-NEXT: smstart za +; CHECK-NEXT: sub x0, x29, #16 +; CHECK-NEXT: sub x20, x29, #16 +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: sturh w8, [x29, #-8] +; CHECK-NEXT: smstart za +; CHECK-NEXT: msr TPIDR2_EL0, x20 +; CHECK-NEXT: bl __arm_tpidr2_save +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: .Ltmp29: +; CHECK-NEXT: bl bar_shared +; CHECK-NEXT: .Ltmp30: +; CHECK-NEXT: // %bb.5: +; CHECK-NEXT: mov w19, #42 +; CHECK-NEXT: b .LBB5_9 +; CHECK-NEXT: .LBB5_6: // %catch.fallthrough +; CHECK-NEXT: cmp w1, #1 +; CHECK-NEXT: b.ne .LBB5_16 +; CHECK-NEXT: // %bb.7: // %catch2 +; CHECK-NEXT: mov x0, x19 +; CHECK-NEXT: bl __cxa_begin_catch +; CHECK-NEXT: smstart za +; CHECK-NEXT: sub x0, x29, #16 +; CHECK-NEXT: sub x20, x29, #16 +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: sturh w8, [x29, #-8] +; CHECK-NEXT: smstart za +; CHECK-NEXT: msr TPIDR2_EL0, x20 +; CHECK-NEXT: bl __arm_tpidr2_save +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: .Ltmp24: +; CHECK-NEXT: bl fizz_shared +; CHECK-NEXT: .Ltmp25: +; CHECK-NEXT: // %bb.8: +; CHECK-NEXT: mov w19, #23 +; CHECK-NEXT: .LBB5_9: // %return.sink.split +; CHECK-NEXT: sub x8, x29, #48 +; CHECK-NEXT: rdsvl x9, #1 +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: sturh w9, [x29, #-40] +; CHECK-NEXT: msr TPIDR2_EL0, x8 +; CHECK-NEXT: bl __cxa_end_catch +; CHECK-NEXT: smstart za +; CHECK-NEXT: sub x0, x29, #48 +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: cbnz x8, .LBB5_11 +; CHECK-NEXT: // %bb.10: // %return.sink.split +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB5_11: // %return.sink.split +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: b .LBB5_2 +; CHECK-NEXT: .LBB5_12: // %lpad2 +; CHECK-NEXT: .Ltmp26: +; 
CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: sturh w8, [x29, #-8] +; CHECK-NEXT: smstart za +; CHECK-NEXT: msr TPIDR2_EL0, x20 +; CHECK-NEXT: bl __arm_tpidr2_save +; CHECK-NEXT: .Ltmp27: +; CHECK-NEXT: bl __cxa_end_catch +; CHECK-NEXT: .Ltmp28: +; CHECK-NEXT: b .LBB5_14 +; CHECK-NEXT: .LBB5_13: // %lpad3 +; CHECK-NEXT: .Ltmp31: +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: sturh w8, [x29, #-8] +; CHECK-NEXT: smstart za +; CHECK-NEXT: msr TPIDR2_EL0, x20 +; CHECK-NEXT: bl __arm_tpidr2_save +; CHECK-NEXT: .Ltmp32: +; CHECK-NEXT: bl __cxa_end_catch +; CHECK-NEXT: .Ltmp33: +; CHECK-NEXT: .LBB5_14: // %check.za13 +; CHECK-NEXT: smstart za +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: cbnz x8, .LBB5_16 +; CHECK-NEXT: // %bb.15: // %restore.za14 +; CHECK-NEXT: sub x0, x29, #16 +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB5_16: // %eh.resume +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: smstop za +; CHECK-NEXT: mov x0, x19 +; CHECK-NEXT: bl _Unwind_Resume +; CHECK-NEXT: .LBB5_17: // %terminate.lpad +; CHECK-NEXT: .Ltmp34: +; CHECK-NEXT: bl __clang_call_terminate +; +; CHECK-LABEL: try_throw: +; CHECK: .Lfunc_begin6: +; CHECK-NEXT: .cfi_startproc +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: .cfi_def_cfa w29, 32 +; CHECK-NEXT: .cfi_offset w19, -16 +; CHECK-NEXT: .cfi_offset w30, -24 +; CHECK-NEXT: .cfi_offset w29, -32 +; CHECK-NEXT: sub sp, sp, #48 +; CHECK-NEXT: rdsvl x19, #1 +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: mul x8, x19, x19 +; CHECK-NEXT: sub x9, x9, x8 +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: add x8, x8, #15 +; CHECK-NEXT: stur x9, [x29, #-48] +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: and x8, x8, #0xfffffffffffffff0 +; CHECK-NEXT: sub x8, x9, x8 +; CHECK-NEXT: stur wzr, [x29, #-36] +; CHECK-NEXT: sturh wzr, [x29, #-38] +; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: sub x9, x29, #32 +; CHECK-NEXT: sub x10, x29, #48 +; CHECK-NEXT: mov w0, #4 +; CHECK-NEXT: stur x8, [x29, #-16] +; CHECK-NEXT: sturh w19, [x29, #-40] +; CHECK-NEXT: stur x9, [x29, #-6] +; CHECK-NEXT: msr TPIDR2_EL0, x10 +; CHECK-NEXT: bl __cxa_allocate_exception +; CHECK-NEXT: mov x8, x0 +; CHECK-NEXT: smstart za +; CHECK-NEXT: sub x0, x29, #48 +; CHECK-NEXT: mrs x9, TPIDR2_EL0 +; CHECK-NEXT: cbnz x9, .LBB6_2 +; CHECK-NEXT: // %bb.1: // %entry +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB6_2: // %entry +; CHECK-NEXT: mov w9, #23 +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: str w9, [x8] +; CHECK-NEXT: sturh w19, [x29, #-8] +; CHECK-NEXT: smstart za +; CHECK-NEXT: sub x19, x29, #16 +; CHECK-NEXT: msr TPIDR2_EL0, x19 +; CHECK-NEXT: bl __arm_tpidr2_save +; CHECK-NEXT: .Ltmp35: +; CHECK-NEXT: adrp x1, :got:except +; CHECK-NEXT: mov x0, x8 +; CHECK-NEXT: mov x2, xzr +; CHECK-NEXT: ldr x1, [x1, :got_lo12:except] +; CHECK-NEXT: bl __cxa_throw +; CHECK-NEXT: .Ltmp36: +; CHECK-NEXT: // %bb.3: // %check.za +; CHECK-NEXT: smstart za +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: cbnz x8, .LBB6_5 +; CHECK-NEXT: // %bb.4: // %restore.za +; CHECK-NEXT: sub x0, x29, #16 +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB6_5: // %unreachable +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: .LBB6_6: // %lpad +; CHECK-NEXT: .Ltmp37: +; CHECK-NEXT: bl __cxa_begin_catch +; CHECK-NEXT: smstart za +; CHECK-NEXT: sub x0, x29, #16 +; CHECK-NEXT: bl __arm_tpidr2_restore +; 
CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: sturh w8, [x29, #-8] +; CHECK-NEXT: smstart za +; CHECK-NEXT: msr TPIDR2_EL0, x19 +; CHECK-NEXT: bl __arm_tpidr2_save +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: .Ltmp38: +; CHECK-NEXT: bl foo_shared +; CHECK-NEXT: .Ltmp39: +; CHECK-NEXT: // %bb.7: // %invoke.cont +; CHECK-NEXT: sub x8, x29, #48 +; CHECK-NEXT: rdsvl x9, #1 +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: sturh w9, [x29, #-40] +; CHECK-NEXT: msr TPIDR2_EL0, x8 +; CHECK-NEXT: bl __cxa_end_catch +; CHECK-NEXT: smstart za +; CHECK-NEXT: sub x0, x29, #48 +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: cbnz x8, .LBB6_9 +; CHECK-NEXT: // %bb.8: // %invoke.cont +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB6_9: // %invoke.cont +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: b bar_shared +; CHECK-NEXT: .LBB6_10: // %lpad1 +; CHECK-NEXT: .Ltmp40: +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: sturh w8, [x29, #-8] +; CHECK-NEXT: smstart za +; CHECK-NEXT: sub x8, x29, #16 +; CHECK-NEXT: msr TPIDR2_EL0, x8 +; CHECK-NEXT: bl __arm_tpidr2_save +; CHECK-NEXT: .Ltmp41: +; CHECK-NEXT: bl __cxa_end_catch +; CHECK-NEXT: .Ltmp42: +; CHECK-NEXT: // %bb.11: // %check.za5 +; CHECK-NEXT: smstart za +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: cbnz x8, .LBB6_13 +; CHECK-NEXT: // %bb.12: // %restore.za6 +; CHECK-NEXT: sub x0, x29, #16 +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB6_13: // %eh.resume +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: smstop za +; CHECK-NEXT: mov x0, x19 +; CHECK-NEXT: bl _Unwind_Resume +; CHECK-NEXT: .LBB6_14: // %terminate.lpad +; CHECK-NEXT: .Ltmp43: +; CHECK-NEXT: bl __clang_call_terminate +; +; CHECK-LABEL: catch_throw: +; CHECK: .Lfunc_begin7: +; CHECK-NEXT: .cfi_startproc +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: .cfi_def_cfa w29, 32 +; CHECK-NEXT: .cfi_offset w19, -8 +; CHECK-NEXT: .cfi_offset w20, -16 +; CHECK-NEXT: .cfi_offset w30, -24 +; CHECK-NEXT: .cfi_offset w29, -32 +; CHECK-NEXT: sub sp, sp, #48 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: mov x10, sp +; CHECK-NEXT: mul x9, x8, x8 +; CHECK-NEXT: sub x10, x10, x9 +; CHECK-NEXT: mov sp, x10 +; CHECK-NEXT: add x9, x9, #15 +; CHECK-NEXT: stur x10, [x29, #-48] +; CHECK-NEXT: mov x10, sp +; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 +; CHECK-NEXT: sub x9, x10, x9 +; CHECK-NEXT: stur wzr, [x29, #-36] +; CHECK-NEXT: sturh wzr, [x29, #-38] +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: sub x10, x29, #32 +; CHECK-NEXT: stur x9, [x29, #-16] +; CHECK-NEXT: sturh w8, [x29, #-8] +; CHECK-NEXT: stur x10, [x29, #-6] +; CHECK-NEXT: smstart za +; CHECK-NEXT: sub x19, x29, #16 +; CHECK-NEXT: msr TPIDR2_EL0, x19 +; CHECK-NEXT: bl __arm_tpidr2_save +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: .Ltmp44: +; CHECK-NEXT: bl foo_shared +; CHECK-NEXT: .Ltmp45: +; CHECK-NEXT: // %bb.1: // %try.cont +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: b bar_shared +; CHECK-NEXT: .LBB7_2: // %lpad1 +; CHECK-NEXT: .Ltmp46: +; CHECK-NEXT: bl __cxa_begin_catch +; CHECK-NEXT: smstart za +; CHECK-NEXT: sub x0, x29, #16 +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: sub x8, x29, #48 +; CHECK-NEXT: mov w0, #4 +; CHECK-NEXT: rdsvl x20, #1 +; CHECK-NEXT: sturh w20, [x29, #-40] +; CHECK-NEXT: msr TPIDR2_EL0, x8 +; CHECK-NEXT: bl __cxa_allocate_exception +; CHECK-NEXT: mov x8, x0 +; CHECK-NEXT: smstart za +; CHECK-NEXT: sub x0, x29, #48 +; CHECK-NEXT: mrs x9, TPIDR2_EL0 +; CHECK-NEXT: cbnz x9, .LBB7_4 +; CHECK-NEXT: // %bb.3: // %lpad1 +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB7_4: // %lpad1 +; CHECK-NEXT: mov w9, #23 +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: str w9, [x8] +; CHECK-NEXT: sturh w20, [x29, #-8] +; CHECK-NEXT: smstart za +; CHECK-NEXT: msr TPIDR2_EL0, x19 +; CHECK-NEXT: bl __arm_tpidr2_save +; CHECK-NEXT: .Ltmp47: +; CHECK-NEXT: adrp x1, :got:except +; CHECK-NEXT: mov x0, x8 +; CHECK-NEXT: mov x2, xzr +; CHECK-NEXT: ldr x1, [x1, :got_lo12:except] +; CHECK-NEXT: bl __cxa_throw +; CHECK-NEXT: .Ltmp48: +; CHECK-NEXT: // %bb.5: // %check.za +; CHECK-NEXT: smstart za +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: cbnz x8, .LBB7_7 +; CHECK-NEXT: // %bb.6: // %restore.za +; CHECK-NEXT: sub x0, x29, #16 +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB7_7: // %unreachable +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: .LBB7_8: // %lpad2 +; CHECK-NEXT: .Ltmp49: +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: sturh w8, [x29, #-8] +; CHECK-NEXT: smstart za +; CHECK-NEXT: sub x8, x29, #16 +; CHECK-NEXT: msr TPIDR2_EL0, x8 +; CHECK-NEXT: bl __arm_tpidr2_save +; CHECK-NEXT: .Ltmp50: +; CHECK-NEXT: bl __cxa_end_catch +; CHECK-NEXT: .Ltmp51: +; CHECK-NEXT: // %bb.9: // %check.za5 +; CHECK-NEXT: smstart za +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: cbnz x8, .LBB7_11 +; CHECK-NEXT: // %bb.10: // %restore.za6 +; CHECK-NEXT: sub x0, x29, #16 +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB7_11: // %eh.resume +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: smstop za +; CHECK-NEXT: mov x0, x19 
+; CHECK-NEXT: bl _Unwind_Resume +; CHECK-NEXT: .LBB7_12: // %terminate.lpad +; CHECK-NEXT: .Ltmp52: +; CHECK-NEXT: bl __clang_call_terminate +; +; CHECK-LABEL: nested: +; CHECK: .Lfunc_begin8: +; CHECK-NEXT: .cfi_startproc +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: .cfi_def_cfa w29, 32 +; CHECK-NEXT: .cfi_offset w19, -8 +; CHECK-NEXT: .cfi_offset w20, -16 +; CHECK-NEXT: .cfi_offset w30, -24 +; CHECK-NEXT: .cfi_offset w29, -32 +; CHECK-NEXT: sub sp, sp, #48 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: mov x10, sp +; CHECK-NEXT: mul x9, x8, x8 +; CHECK-NEXT: sub x10, x10, x9 +; CHECK-NEXT: mov sp, x10 +; CHECK-NEXT: add x9, x9, #15 +; CHECK-NEXT: stur x10, [x29, #-48] +; CHECK-NEXT: mov x10, sp +; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 +; CHECK-NEXT: sub x9, x10, x9 +; CHECK-NEXT: stur wzr, [x29, #-36] +; CHECK-NEXT: sturh wzr, [x29, #-38] +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: sub x10, x29, #32 +; CHECK-NEXT: stur x9, [x29, #-16] +; CHECK-NEXT: sturh w8, [x29, #-8] +; CHECK-NEXT: stur x10, [x29, #-6] +; CHECK-NEXT: smstart za +; CHECK-NEXT: sub x19, x29, #16 +; CHECK-NEXT: msr TPIDR2_EL0, x19 +; CHECK-NEXT: bl __arm_tpidr2_save +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: .Ltmp53: +; CHECK-NEXT: bl foo_shared +; CHECK-NEXT: .Ltmp54: +; CHECK-NEXT: // %bb.1: // %invoke.cont1 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: sturh w8, [x29, #-8] +; CHECK-NEXT: smstart za +; CHECK-NEXT: msr TPIDR2_EL0, x19 +; CHECK-NEXT: bl __arm_tpidr2_save +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: .Ltmp56: +; CHECK-NEXT: bl bar_shared +; CHECK-NEXT: .Ltmp57: +; CHECK-NEXT: .LBB8_2: // %try.cont14 +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB8_3: // %lpad2 +; CHECK-NEXT: .Ltmp58: +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: cmp w1, #2 +; CHECK-NEXT: b.ne .LBB8_14 +; CHECK-NEXT: // %bb.4: // %catch1 +; CHECK-NEXT: mov x0, x19 +; CHECK-NEXT: bl __cxa_begin_catch +; CHECK-NEXT: smstart za +; CHECK-NEXT: sub x0, x29, #16 +; CHECK-NEXT: sub x20, x29, #16 +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: sturh w8, [x29, #-8] +; CHECK-NEXT: smstart za +; CHECK-NEXT: msr TPIDR2_EL0, x20 +; CHECK-NEXT: bl __arm_tpidr2_save +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: .Ltmp59: +; CHECK-NEXT: bl fizz_shared +; CHECK-NEXT: .Ltmp60: +; CHECK-NEXT: // %bb.5: // %invoke.cont2 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: sturh w8, [x29, #-8] +; CHECK-NEXT: smstart za +; CHECK-NEXT: msr TPIDR2_EL0, x20 +; CHECK-NEXT: bl __arm_tpidr2_save +; CHECK-NEXT: .Ltmp64: +; CHECK-NEXT: bl __cxa_end_catch +; CHECK-NEXT: .Ltmp65: +; CHECK-NEXT: // %bb.6: // %check.za +; CHECK-NEXT: smstart za +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: cbnz x8, .LBB8_2 +; CHECK-NEXT: // %bb.7: // %restore.za +; CHECK-NEXT: sub x0, x29, #16 +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: b .LBB8_2 +; CHECK-NEXT: .LBB8_8: // %lpad4 +; CHECK-NEXT: .Ltmp66: +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: b .LBB8_12 +; CHECK-NEXT: .LBB8_9: // %lpad3 +; CHECK-NEXT: .Ltmp61: +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: sturh w8, [x29, #-8] +; 
CHECK-NEXT: smstart za +; CHECK-NEXT: msr TPIDR2_EL0, x20 +; CHECK-NEXT: bl __arm_tpidr2_save +; CHECK-NEXT: .Ltmp62: +; CHECK-NEXT: bl __cxa_end_catch +; CHECK-NEXT: .Ltmp63: +; CHECK-NEXT: // %bb.10: // %check.za13 +; CHECK-NEXT: smstart za +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: cbnz x8, .LBB8_12 +; CHECK-NEXT: // %bb.11: // %restore.za14 +; CHECK-NEXT: sub x0, x29, #16 +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB8_12: // %eh.cleanup +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: b .LBB8_14 +; CHECK-NEXT: .LBB8_13: // %lpad1 +; CHECK-NEXT: .Ltmp55: +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: .LBB8_14: // %catch2 +; CHECK-NEXT: mov x0, x19 +; CHECK-NEXT: bl __cxa_begin_catch +; CHECK-NEXT: smstart za +; CHECK-NEXT: sub x0, x29, #16 +; CHECK-NEXT: sub x20, x29, #16 +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: sturh w8, [x29, #-8] +; CHECK-NEXT: smstart za +; CHECK-NEXT: msr TPIDR2_EL0, x20 +; CHECK-NEXT: bl __arm_tpidr2_save +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: .Ltmp67: +; CHECK-NEXT: bl buzz_shared +; CHECK-NEXT: .Ltmp68: +; CHECK-NEXT: // %bb.15: // %invoke.cont3 +; CHECK-NEXT: sub x8, x29, #48 +; CHECK-NEXT: rdsvl x9, #1 +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: sturh w9, [x29, #-40] +; CHECK-NEXT: msr TPIDR2_EL0, x8 +; CHECK-NEXT: bl __cxa_end_catch +; CHECK-NEXT: smstart za +; CHECK-NEXT: sub x0, x29, #48 +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: cbnz x8, .LBB8_17 +; CHECK-NEXT: // %bb.16: // %invoke.cont3 +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB8_17: // %invoke.cont3 +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: b .LBB8_2 +; CHECK-NEXT: .LBB8_18: // %lpad5 +; CHECK-NEXT: .Ltmp69: +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: sturh w8, [x29, #-8] +; CHECK-NEXT: smstart za +; CHECK-NEXT: msr TPIDR2_EL0, x20 +; CHECK-NEXT: bl __arm_tpidr2_save +; CHECK-NEXT: .Ltmp70: +; CHECK-NEXT: bl __cxa_end_catch +; CHECK-NEXT: .Ltmp71: +; CHECK-NEXT: // %bb.19: // %check.za25 +; CHECK-NEXT: smstart za +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: cbnz x8, .LBB8_21 +; CHECK-NEXT: // %bb.20: // %restore.za26 +; CHECK-NEXT: sub x0, x29, #16 +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB8_21: // %eh.resume +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: smstop za +; CHECK-NEXT: mov x0, x19 +; CHECK-NEXT: bl _Unwind_Resume +; CHECK-NEXT: .LBB8_22: // %terminate.lpad +; CHECK-NEXT: .Ltmp72: +; CHECK-NEXT: bl __clang_call_terminate +; +; CHECK-LABEL: conditional_throw: +; CHECK: .Lfunc_begin9: +; CHECK-NEXT: .cfi_startproc +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: .cfi_def_cfa w29, 32 +; CHECK-NEXT: .cfi_offset w19, -8 +; CHECK-NEXT: .cfi_offset w20, -16 +; CHECK-NEXT: .cfi_offset w30, -24 +; CHECK-NEXT: .cfi_offset w29, -32 +; CHECK-NEXT: sub sp, sp, #48 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: mul x8, x8, x8 +; CHECK-NEXT: sub x9, x9, x8 +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: add x8, x8, #15 +; CHECK-NEXT: stur x9, [x29, #-48] +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: and x8, x8, #0xfffffffffffffff0 +; CHECK-NEXT: sub x8, x9, x8 +; CHECK-NEXT: stur wzr, [x29, #-36] +; CHECK-NEXT: sturh wzr, [x29, #-38] +; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: sub x9, x29, #32 +; CHECK-NEXT: stur x8, [x29, #-16] +; CHECK-NEXT: stur x9, [x29, #-6] +; CHECK-NEXT: tbnz w0, #0, .LBB9_3 +; CHECK-NEXT: // %bb.1: // %if.end +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: sturh w8, [x29, #-8] +; CHECK-NEXT: smstart za +; CHECK-NEXT: sub x8, x29, #16 +; CHECK-NEXT: msr TPIDR2_EL0, x8 +; CHECK-NEXT: bl __arm_tpidr2_save +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: .Ltmp73: +; CHECK-NEXT: bl bar_shared +; CHECK-NEXT: .Ltmp74: +; CHECK-NEXT: .LBB9_2: // %try.cont +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB9_3: // %if.then +; CHECK-NEXT: sub x8, x29, #48 +; CHECK-NEXT: mov w0, #4 +; CHECK-NEXT: rdsvl x19, #1 +; CHECK-NEXT: sturh w19, [x29, #-40] +; CHECK-NEXT: msr TPIDR2_EL0, x8 +; CHECK-NEXT: bl __cxa_allocate_exception +; CHECK-NEXT: mov x8, x0 +; CHECK-NEXT: smstart za +; CHECK-NEXT: sub x0, x29, #48 +; CHECK-NEXT: mrs x9, TPIDR2_EL0 +; CHECK-NEXT: cbnz x9, .LBB9_5 +; CHECK-NEXT: // %bb.4: // %if.then +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB9_5: // %if.then +; CHECK-NEXT: mov w9, #23 +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: str w9, [x8] +; CHECK-NEXT: sturh w19, [x29, #-8] +; CHECK-NEXT: smstart za +; CHECK-NEXT: sub x9, x29, #16 +; CHECK-NEXT: msr TPIDR2_EL0, x9 +; CHECK-NEXT: bl __arm_tpidr2_save +; CHECK-NEXT: .Ltmp75: +; CHECK-NEXT: adrp x1, :got:except +; CHECK-NEXT: mov x0, x8 +; CHECK-NEXT: mov x2, xzr +; CHECK-NEXT: ldr x1, [x1, :got_lo12:except] +; CHECK-NEXT: bl __cxa_throw +; CHECK-NEXT: .Ltmp76: +; CHECK-NEXT: // %bb.6: // %check.za +; CHECK-NEXT: smstart za +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: cbnz x8, .LBB9_8 +; CHECK-NEXT: // %bb.7: // %restore.za +; CHECK-NEXT: sub x0, x29, #16 +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB9_8: // %unreachable +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: .LBB9_9: // %lpad +; CHECK-NEXT: .Ltmp77: +; CHECK-NEXT: bl __cxa_begin_catch +; CHECK-NEXT: smstart za +; CHECK-NEXT: sub x0, x29, #16 +; CHECK-NEXT: sub x20, x29, #16 +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: sturh w8, [x29, #-8] +; CHECK-NEXT: smstart za +; CHECK-NEXT: msr TPIDR2_EL0, x20 +; CHECK-NEXT: bl __arm_tpidr2_save +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: .Ltmp78: +; CHECK-NEXT: bl fizz_shared +; CHECK-NEXT: .Ltmp79: +; CHECK-NEXT: // %bb.10: // %invoke.cont2 +; CHECK-NEXT: sub x8, x29, #48 +; CHECK-NEXT: rdsvl x9, #1 +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: sturh w9, [x29, #-40] +; CHECK-NEXT: msr TPIDR2_EL0, x8 +; CHECK-NEXT: bl __cxa_end_catch +; CHECK-NEXT: smstart za +; 
CHECK-NEXT: sub x0, x29, #48 +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: cbnz x8, .LBB9_12 +; CHECK-NEXT: // %bb.11: // %invoke.cont2 +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB9_12: // %invoke.cont2 +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: b .LBB9_2 +; CHECK-NEXT: .LBB9_13: // %lpad1 +; CHECK-NEXT: .Ltmp80: +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: sturh w8, [x29, #-8] +; CHECK-NEXT: smstart za +; CHECK-NEXT: msr TPIDR2_EL0, x20 +; CHECK-NEXT: bl __arm_tpidr2_save +; CHECK-NEXT: .Ltmp81: +; CHECK-NEXT: bl __cxa_end_catch +; CHECK-NEXT: .Ltmp82: +; CHECK-NEXT: // %bb.14: // %check.za9 +; CHECK-NEXT: smstart za +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: cbnz x8, .LBB9_16 +; CHECK-NEXT: // %bb.15: // %restore.za10 +; CHECK-NEXT: sub x0, x29, #16 +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB9_16: // %eh.resume +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: smstop za +; CHECK-NEXT: mov x0, x19 +; CHECK-NEXT: bl _Unwind_Resume +; CHECK-NEXT: .LBB9_17: // %terminate.lpad +; CHECK-NEXT: .Ltmp83: +; CHECK-NEXT: bl __clang_call_terminate +; +; CHECK-LABEL: throw_in_signature: +; CHECK: .Lfunc_begin10: +; CHECK-NEXT: .cfi_startproc +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: .cfi_def_cfa w29, 16 +; CHECK-NEXT: .cfi_offset w30, -8 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub sp, sp, #48 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: mul x8, x8, x8 +; CHECK-NEXT: sub x9, x9, x8 +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: add x8, x8, #15 +; CHECK-NEXT: stur x9, [x29, #-48] +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: and x8, x8, #0xfffffffffffffff0 +; CHECK-NEXT: sub x8, x9, x8 +; CHECK-NEXT: stur wzr, [x29, #-36] +; CHECK-NEXT: sturh wzr, [x29, #-38] +; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: sub x9, x29, #32 +; CHECK-NEXT: stur x8, [x29, #-16] +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: stur x9, [x29, #-6] +; CHECK-NEXT: tbz w0, #0, .LBB10_3 +; CHECK-NEXT: // %bb.1: // %if.then +; CHECK-NEXT: sturh w8, [x29, #-8] +; CHECK-NEXT: smstart za +; CHECK-NEXT: sub x8, x29, #16 +; CHECK-NEXT: msr TPIDR2_EL0, x8 +; CHECK-NEXT: bl __arm_tpidr2_save +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: .Ltmp86: +; CHECK-NEXT: bl bar_shared +; CHECK-NEXT: .Ltmp87: +; CHECK-NEXT: // %bb.2: +; CHECK-NEXT: mov w0, #42 +; CHECK-NEXT: b .LBB10_5 +; CHECK-NEXT: .LBB10_3: // %if.end +; CHECK-NEXT: sturh w8, [x29, #-8] +; CHECK-NEXT: smstart za +; CHECK-NEXT: sub x8, x29, #16 +; CHECK-NEXT: msr TPIDR2_EL0, x8 +; CHECK-NEXT: bl __arm_tpidr2_save +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: .Ltmp84: +; CHECK-NEXT: bl fizz_shared +; CHECK-NEXT: .Ltmp85: +; CHECK-NEXT: // %bb.4: +; CHECK-NEXT: mov w0, #23 +; CHECK-NEXT: .LBB10_5: // %return +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB10_6: // %lpad +; CHECK-NEXT: .Ltmp88: +; CHECK-NEXT: bl __cxa_call_unexpected +; +; CHECK-LABEL: try_func: +; CHECK: .Lfunc_begin11: +; CHECK-NEXT: .cfi_startproc +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: .cfi_def_cfa w29, 32 +; CHECK-NEXT: .cfi_offset w19, -8 +; CHECK-NEXT: .cfi_offset w20, -16 +; CHECK-NEXT: .cfi_offset w30, -24 +; CHECK-NEXT: .cfi_offset w29, -32 +; CHECK-NEXT: sub sp, sp, #48 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: mov x10, sp +; CHECK-NEXT: mul x9, x8, x8 +; CHECK-NEXT: sub x10, x10, x9 +; CHECK-NEXT: mov sp, x10 +; CHECK-NEXT: add x9, x9, #15 +; CHECK-NEXT: stur x10, [x29, #-48] +; CHECK-NEXT: mov x10, sp +; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 +; CHECK-NEXT: sub x9, x10, x9 +; CHECK-NEXT: stur wzr, [x29, #-36] +; CHECK-NEXT: sturh wzr, [x29, #-38] +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: sub x10, x29, #32 +; CHECK-NEXT: stur x9, [x29, #-16] +; CHECK-NEXT: sturh w8, [x29, #-8] +; CHECK-NEXT: stur x10, [x29, #-6] +; CHECK-NEXT: smstart za +; CHECK-NEXT: sub x8, x29, #16 +; CHECK-NEXT: msr TPIDR2_EL0, x8 +; CHECK-NEXT: bl __arm_tpidr2_save +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: .Ltmp89: +; CHECK-NEXT: bl fizz_shared +; CHECK-NEXT: .Ltmp90: +; CHECK-NEXT: .LBB11_1: // %try.cont +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB11_2: // %lpad +; CHECK-NEXT: .Ltmp91: +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: cmp w1, #1 +; CHECK-NEXT: b.ne .LBB11_10 +; CHECK-NEXT: // %bb.3: // %catch +; CHECK-NEXT: mov x0, x19 +; CHECK-NEXT: bl __cxa_begin_catch +; CHECK-NEXT: smstart za +; CHECK-NEXT: sub x0, x29, #16 +; CHECK-NEXT: sub x20, x29, #16 +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: sturh w8, [x29, #-8] +; CHECK-NEXT: smstart za +; CHECK-NEXT: msr TPIDR2_EL0, x20 +; CHECK-NEXT: bl __arm_tpidr2_save +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: .Ltmp92: +; CHECK-NEXT: bl buzz_shared +; CHECK-NEXT: .Ltmp93: +; CHECK-NEXT: // %bb.4: // %invoke.cont +; CHECK-NEXT: sub x8, x29, #48 +; CHECK-NEXT: rdsvl x9, #1 +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: sturh w9, [x29, #-40] +; CHECK-NEXT: msr TPIDR2_EL0, x8 +; CHECK-NEXT: bl __cxa_end_catch +; CHECK-NEXT: smstart za +; CHECK-NEXT: sub x0, x29, #48 +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: cbnz x8, .LBB11_6 +; CHECK-NEXT: // %bb.5: // %invoke.cont +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB11_6: // %invoke.cont +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: b .LBB11_1 +; CHECK-NEXT: .LBB11_7: // %lpad1 +; CHECK-NEXT: .Ltmp94: +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: sturh w8, [x29, #-8] +; CHECK-NEXT: smstart za +; CHECK-NEXT: msr TPIDR2_EL0, x20 +; CHECK-NEXT: bl __arm_tpidr2_save +; CHECK-NEXT: .Ltmp95: +; CHECK-NEXT: bl __cxa_end_catch +; CHECK-NEXT: .Ltmp96: +; CHECK-NEXT: // %bb.8: // %check.za +; CHECK-NEXT: smstart za +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: cbnz x8, .LBB11_10 +; CHECK-NEXT: // %bb.9: // %restore.za +; CHECK-NEXT: sub x0, x29, #16 +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB11_10: // %eh.resume +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: smstop za +; CHECK-NEXT: mov x0, x19 +; CHECK-NEXT: bl _Unwind_Resume +; CHECK-NEXT: .LBB11_11: // %terminate.lpad +; CHECK-NEXT: .Ltmp97: +; CHECK-NEXT: bl __clang_call_terminate diff --git a/llvm/test/CodeGen/AArch64/sme-exceptions-with-za-state.ll 
b/llvm/test/CodeGen/AArch64/sme-exceptions-with-za-state.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme-exceptions-with-za-state.ll @@ -0,0 +1,1607 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -mtriple=aarch64-linux-gnu -aarch64-sme-abi %s | FileCheck %s +; RUN: opt -S -mtriple=aarch64-linux-gnu -aarch64-sme-abi -aarch64-sme-abi %s | FileCheck %s + +declare void @foo_shared() "aarch64_pstate_za_shared" +declare void @bar_shared() "aarch64_pstate_za_shared" +declare void @fizz_shared() "aarch64_pstate_za_shared" +declare void @buzz_shared() "aarch64_pstate_za_shared" + +declare void @snap_new() "aarch64_pstate_za_new" + +declare i32 @normal_callee() + +@except = external constant ptr +@invalid_argument = external constant ptr +@overflow_error = external constant ptr +@runtime_error = external constant ptr + +; Try/Catch with one invoke of a private ZA function. +; Lazy-save must be set up for the invoke and for the private callee. + +define i32 @private_za_invoke() "aarch64_pstate_za_shared" "aarch64_pstate_sm_enabled" personality i32 1 { +; CHECK-LABEL: @private_za_invoke( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TPIDR2_INVOKE_OBJ:%.*]] = alloca { ptr, i16, [6 x i8] }, align 8 +; CHECK-NEXT: [[N:%.*]] = call i64 @llvm.aarch64.sme.cntsb() +; CHECK-NEXT: [[NN:%.*]] = mul i64 [[N]], [[N]] +; CHECK-NEXT: [[BUFFER:%.*]] = alloca i8, i64 [[NN]], align 16 +; CHECK-NEXT: [[TPIDR2_INVOKE_OBJ_BUFFER:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 0 +; CHECK-NEXT: store ptr [[BUFFER]], ptr [[TPIDR2_INVOKE_OBJ_BUFFER]], align 8 +; CHECK-NEXT: [[ZERO:%.*]] = alloca i48, i48 0, align 16 +; CHECK-NEXT: [[RESERVED:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 2 +; CHECK-NEXT: store ptr [[ZERO]], ptr [[RESERVED]], align 8 +; CHECK-NEXT: [[LIVE:%.*]] = call i64 @llvm.aarch64.sme.get.live.za.slices() +; CHECK-NEXT: [[LIVE_TRUNC:%.*]] = trunc i64 [[LIVE]] to i16 +; CHECK-NEXT: [[TPIDR2_OBJ_LIVE:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 1 +; CHECK-NEXT: store i16 [[LIVE_TRUNC]], ptr [[TPIDR2_OBJ_LIVE]], align 2 +; CHECK-NEXT: [[TPI_INT:%.*]] = ptrtoint ptr [[TPIDR2_INVOKE_OBJ]] to i64 +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 [[TPI_INT]]) +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_save() +; CHECK-NEXT: invoke void @normal_callee() +; CHECK-NEXT: to label [[CHECK_ZA:%.*]] unwind label [[LPAD:%.*]] +; CHECK: lpad: +; CHECK-NEXT: [[TMP0:%.*]] = landingpad { ptr, i32 } +; CHECK-NEXT: catch ptr null +; CHECK-NEXT: call void @llvm.aarch64.sme.invoke.resume.pstatesm(i64 1) +; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { ptr, i32 } [[TMP0]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = tail call ptr @__cxa_begin_catch(ptr [[TMP1]]) +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_restore(ptr [[TPIDR2_INVOKE_OBJ]]) +; CHECK-NEXT: tail call void @__cxa_end_catch() +; CHECK-NEXT: br label [[RETURN:%.*]] +; CHECK: check.za: +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: [[TPIDR2:%.*]] = call i64 @llvm.aarch64.sme.get.tpidr2() +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[TPIDR2]], 0 +; CHECK-NEXT: br i1 [[CMP]], label [[RESTORE_ZA:%.*]], label [[RETURN]] +; CHECK: restore.za: +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_restore(ptr 
[[TPIDR2_INVOKE_OBJ]]) +; CHECK-NEXT: br label [[RETURN]] +; CHECK: return: +; CHECK-NEXT: [[RETVAL:%.*]] = phi i32 [ 23, [[LPAD]] ], [ 15, [[CHECK_ZA]] ], [ 15, [[RESTORE_ZA]] ] +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: ret i32 [[RETVAL]] +; +entry: + invoke void @normal_callee() to label %return unwind label %lpad + +lpad: + %0 = landingpad { ptr, i32 } + catch ptr null + %1 = extractvalue { ptr, i32 } %0, 0 + %2 = tail call ptr @__cxa_begin_catch(ptr %1) + tail call void @__cxa_end_catch() + br label %return + +return: + %retval = phi i32 [ 23, %lpad ], [ 15, %entry ] + ret i32 %retval +} + +; Try/Catch with one invoke of a new ZA function. + +define i32 @new_za_invoke() "aarch64_pstate_za_shared" "aarch64_pstate_sm_compatible" personality i32 1 { +; CHECK-LABEL: @new_za_invoke( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call aarch64_sme_preservemost_from_x2 [[TMP0]] @__arm_sme_state() +; CHECK-NEXT: [[TMP1:%.*]] = extractvalue [[TMP0]] [[TMP0]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = and i64 [[TMP1]], 1 +; CHECK-NEXT: [[TPIDR2_INVOKE_OBJ:%.*]] = alloca { ptr, i16, [6 x i8] }, align 8 +; CHECK-NEXT: [[N:%.*]] = call i64 @llvm.aarch64.sme.cntsb() +; CHECK-NEXT: [[NN:%.*]] = mul i64 [[N]], [[N]] +; CHECK-NEXT: [[BUFFER:%.*]] = alloca i8, i64 [[NN]], align 16 +; CHECK-NEXT: [[TPIDR2_INVOKE_OBJ_BUFFER:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 0 +; CHECK-NEXT: store ptr [[BUFFER]], ptr [[TPIDR2_INVOKE_OBJ_BUFFER]], align 8 +; CHECK-NEXT: [[ZERO:%.*]] = alloca i48, i48 0, align 16 +; CHECK-NEXT: [[RESERVED:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 2 +; CHECK-NEXT: store ptr [[ZERO]], ptr [[RESERVED]], align 8 +; CHECK-NEXT: [[LIVE:%.*]] = call i64 @llvm.aarch64.sme.get.live.za.slices() +; CHECK-NEXT: [[LIVE_TRUNC:%.*]] = trunc i64 [[LIVE]] to i16 +; CHECK-NEXT: [[TPIDR2_OBJ_LIVE:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 1 +; CHECK-NEXT: store i16 [[LIVE_TRUNC]], ptr [[TPIDR2_OBJ_LIVE]], align 2 +; CHECK-NEXT: [[TPI_INT:%.*]] = ptrtoint ptr [[TPIDR2_INVOKE_OBJ]] to i64 +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 [[TPI_INT]]) +; CHECK-NEXT: invoke void @snap_new() +; CHECK-NEXT: to label [[CHECK_ZA:%.*]] unwind label [[LPAD:%.*]] +; CHECK: lpad: +; CHECK-NEXT: [[TMP3:%.*]] = landingpad { ptr, i32 } +; CHECK-NEXT: catch ptr null +; CHECK-NEXT: call void @llvm.aarch64.sme.invoke.resume.pstatesm(i64 [[TMP2]]) +; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { ptr, i32 } [[TMP3]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = tail call ptr @__cxa_begin_catch(ptr [[TMP4]]) +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_restore(ptr [[TPIDR2_INVOKE_OBJ]]) +; CHECK-NEXT: tail call void @__cxa_end_catch() +; CHECK-NEXT: br label [[RETURN:%.*]] +; CHECK: check.za: +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: [[TPIDR2:%.*]] = call i64 @llvm.aarch64.sme.get.tpidr2() +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[TPIDR2]], 0 +; CHECK-NEXT: br i1 [[CMP]], label [[RESTORE_ZA:%.*]], label [[RETURN]] +; CHECK: restore.za: +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_restore(ptr [[TPIDR2_INVOKE_OBJ]]) +; CHECK-NEXT: br label [[RETURN]] +; CHECK: return: +; CHECK-NEXT: [[RETVAL:%.*]] = phi i32 [ 23, [[LPAD]] ], [ 15, [[CHECK_ZA]] ], [ 15, [[RESTORE_ZA]] ] +; CHECK-NEXT: call void 
@llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: ret i32 [[RETVAL]] +; +entry: + invoke void @snap_new() to label %return unwind label %lpad + +lpad: + %0 = landingpad { ptr, i32 } + catch ptr null + %1 = extractvalue { ptr, i32 } %0, 0 + %2 = tail call ptr @__cxa_begin_catch(ptr %1) + tail call void @__cxa_end_catch() + br label %return + +return: + %retval = phi i32 [ 23, %lpad ], [ 15, %entry ] + ret i32 %retval +} + +; Try/Catch with one invoke of a shared ZA function and a private ZA call in the +; (unconditional) catch block. + +; int unconditional() { +; try { +; foo(); +; } catch (...) { +; normal_callee(); +; return 23; +; } +; return 15; +; } + +define i32 @private_za_in_catch_block() "aarch64_pstate_za_shared" personality i32 1 { +; CHECK-LABEL: @private_za_in_catch_block( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TPIDR2_INVOKE_OBJ:%.*]] = alloca { ptr, i16, [6 x i8] }, align 8 +; CHECK-NEXT: [[N:%.*]] = call i64 @llvm.aarch64.sme.cntsb() +; CHECK-NEXT: [[NN:%.*]] = mul i64 [[N]], [[N]] +; CHECK-NEXT: [[BUFFER:%.*]] = alloca i8, i64 [[NN]], align 16 +; CHECK-NEXT: [[TPIDR2_INVOKE_OBJ_BUFFER:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 0 +; CHECK-NEXT: store ptr [[BUFFER]], ptr [[TPIDR2_INVOKE_OBJ_BUFFER]], align 8 +; CHECK-NEXT: [[ZERO:%.*]] = alloca i48, i48 0, align 16 +; CHECK-NEXT: [[RESERVED:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 2 +; CHECK-NEXT: store ptr [[ZERO]], ptr [[RESERVED]], align 8 +; CHECK-NEXT: [[LIVE:%.*]] = call i64 @llvm.aarch64.sme.get.live.za.slices() +; CHECK-NEXT: [[LIVE_TRUNC:%.*]] = trunc i64 [[LIVE]] to i16 +; CHECK-NEXT: [[TPIDR2_OBJ_LIVE:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 1 +; CHECK-NEXT: store i16 [[LIVE_TRUNC]], ptr [[TPIDR2_OBJ_LIVE]], align 2 +; CHECK-NEXT: [[TPI_INT:%.*]] = ptrtoint ptr [[TPIDR2_INVOKE_OBJ]] to i64 +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 [[TPI_INT]]) +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_save() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: invoke void @foo_shared() +; CHECK-NEXT: to label [[RETURN:%.*]] unwind label [[LPAD1:%.*]] +; CHECK: lpad1: +; CHECK-NEXT: [[TMP0:%.*]] = landingpad { ptr, i32 } +; CHECK-NEXT: catch ptr null +; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { ptr, i32 } [[TMP0]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = tail call ptr @__cxa_begin_catch(ptr [[TMP1]]) +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_restore(ptr [[TPIDR2_INVOKE_OBJ]]) +; CHECK-NEXT: call void @normal_callee() +; CHECK-NEXT: tail call void @__cxa_end_catch() +; CHECK-NEXT: br label [[RETURN]] +; CHECK: return: +; CHECK-NEXT: [[RETVAL:%.*]] = phi i32 [ 23, [[LPAD1]] ], [ 15, [[ENTRY:%.*]] ] +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: ret i32 [[RETVAL]] +; +entry: + invoke void @foo_shared() to label %return unwind label %lpad1 + +lpad1: + %0 = landingpad { ptr, i32 } + catch ptr null + %1 = extractvalue { ptr, i32 } %0, 0 + %2 = tail call ptr @__cxa_begin_catch(ptr %1) + call void @normal_callee() + tail call void @__cxa_end_catch() + br label %return + +return: + %retval = phi i32 [ 23, %lpad1 ], [ 15, %entry ] + ret i32 %retval +} + +; Try/Catch with one invoke of a shared ZA function and a private ZA call in the +; (unconditional) catch block. 
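+; This variant covers the same scenario as the test above, but the calling function
+; is marked "aarch64_pstate_za_new" rather than "aarch64_pstate_za_shared", so the
+; expected output additionally contains a prelude that commits any dormant lazy-save
+; (via __arm_tpidr2_save) and enables ZA before the invoke of the shared-ZA callee.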
+ +; int unconditional() { +; try { +; foo(); +; } catch (...) { +; normal_callee(); +; return 23; +; } +; return 15; +; } + +define i32 @private_za_in_catch_block_from_new_za() "aarch64_pstate_za_new" personality i32 1 { +; CHECK-LABEL: @private_za_in_catch_block_from_new_za( +; CHECK-NEXT: prelude: +; CHECK-NEXT: [[TPIDR2_INVOKE_OBJ:%.*]] = alloca { ptr, i16, [6 x i8] }, align 8 +; CHECK-NEXT: [[N:%.*]] = call i64 @llvm.aarch64.sme.cntsb() +; CHECK-NEXT: [[NN:%.*]] = mul i64 [[N]], [[N]] +; CHECK-NEXT: [[BUFFER:%.*]] = alloca i8, i64 [[NN]], align 16 +; CHECK-NEXT: [[TPIDR2_INVOKE_OBJ_BUFFER:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 0 +; CHECK-NEXT: store ptr [[BUFFER]], ptr [[TPIDR2_INVOKE_OBJ_BUFFER]], align 8 +; CHECK-NEXT: [[ZERO:%.*]] = alloca i48, i48 0, align 16 +; CHECK-NEXT: [[RESERVED:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 2 +; CHECK-NEXT: store ptr [[ZERO]], ptr [[RESERVED]], align 8 +; CHECK-NEXT: [[TPIDR2:%.*]] = call i64 @llvm.aarch64.sme.get.tpidr2() +; CHECK-NEXT: [[CMP:%.*]] = icmp ne i64 [[TPIDR2]], 0 +; CHECK-NEXT: br i1 [[CMP]], label [[SAVE_ZA:%.*]], label [[ENTRY:%.*]] +; CHECK: save.za: +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_save() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: br label [[ENTRY]] +; CHECK: entry: +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: [[LIVE:%.*]] = call i64 @llvm.aarch64.sme.get.live.za.slices() +; CHECK-NEXT: [[LIVE_TRUNC:%.*]] = trunc i64 [[LIVE]] to i16 +; CHECK-NEXT: [[TPIDR2_OBJ_LIVE:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 1 +; CHECK-NEXT: store i16 [[LIVE_TRUNC]], ptr [[TPIDR2_OBJ_LIVE]], align 2 +; CHECK-NEXT: [[TPI_INT:%.*]] = ptrtoint ptr [[TPIDR2_INVOKE_OBJ]] to i64 +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 [[TPI_INT]]) +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_save() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: invoke void @foo_shared() +; CHECK-NEXT: to label [[RETURN:%.*]] unwind label [[LPAD1:%.*]] +; CHECK: lpad1: +; CHECK-NEXT: [[TMP0:%.*]] = landingpad { ptr, i32 } +; CHECK-NEXT: catch ptr null +; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { ptr, i32 } [[TMP0]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = tail call ptr @__cxa_begin_catch(ptr [[TMP1]]) +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_restore(ptr [[TPIDR2_INVOKE_OBJ]]) +; CHECK-NEXT: call void @normal_callee() +; CHECK-NEXT: tail call void @__cxa_end_catch() +; CHECK-NEXT: br label [[RETURN]] +; CHECK: return: +; CHECK-NEXT: [[RETVAL:%.*]] = phi i32 [ 23, [[LPAD1]] ], [ 15, [[ENTRY]] ] +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: call void @llvm.aarch64.sme.za.disable() +; CHECK-NEXT: ret i32 [[RETVAL]] +; +entry: + invoke void @foo_shared() to label %return unwind label %lpad1 + +lpad1: + %0 = landingpad { ptr, i32 } + catch ptr null + %1 = extractvalue { ptr, i32 } %0, 0 + %2 = tail call ptr @__cxa_begin_catch(ptr %1) + call void @normal_callee() + tail call void @__cxa_end_catch() + br label %return + +return: + %retval = phi i32 [ 23, %lpad1 ], [ 15, %entry ] + ret i32 %retval +} + + +; +; Try/Catch with one catch clause. 
Set up & restore lazy-save +; +; void except_func() { +; try { +; foo(); +; } catch (const std::invalid_argument& e) { +; bar(); +; } +; fizz(); +; } + +define void @call_after_catch_block() "aarch64_pstate_za_shared" personality i32 0 { +; CHECK-LABEL: @call_after_catch_block( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TPIDR2_INVOKE_OBJ:%.*]] = alloca { ptr, i16, [6 x i8] }, align 8 +; CHECK-NEXT: [[N:%.*]] = call i64 @llvm.aarch64.sme.cntsb() +; CHECK-NEXT: [[NN:%.*]] = mul i64 [[N]], [[N]] +; CHECK-NEXT: [[BUFFER:%.*]] = alloca i8, i64 [[NN]], align 16 +; CHECK-NEXT: [[TPIDR2_INVOKE_OBJ_BUFFER:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 0 +; CHECK-NEXT: store ptr [[BUFFER]], ptr [[TPIDR2_INVOKE_OBJ_BUFFER]], align 8 +; CHECK-NEXT: [[ZERO:%.*]] = alloca i48, i48 0, align 16 +; CHECK-NEXT: [[RESERVED:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 2 +; CHECK-NEXT: store ptr [[ZERO]], ptr [[RESERVED]], align 8 +; CHECK-NEXT: [[LIVE:%.*]] = call i64 @llvm.aarch64.sme.get.live.za.slices() +; CHECK-NEXT: [[LIVE_TRUNC:%.*]] = trunc i64 [[LIVE]] to i16 +; CHECK-NEXT: [[TPIDR2_OBJ_LIVE:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 1 +; CHECK-NEXT: store i16 [[LIVE_TRUNC]], ptr [[TPIDR2_OBJ_LIVE]], align 2 +; CHECK-NEXT: [[TPI_INT:%.*]] = ptrtoint ptr [[TPIDR2_INVOKE_OBJ]] to i64 +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 [[TPI_INT]]) +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_save() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: invoke void @foo_shared() +; CHECK-NEXT: to label [[TRY_CONT:%.*]] unwind label [[LPAD1:%.*]] +; CHECK: lpad1: +; CHECK-NEXT: [[TMP0:%.*]] = landingpad { ptr, i32 } +; CHECK-NEXT: catch ptr @invalid_argument +; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { ptr, i32 } [[TMP0]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = tail call i32 @llvm.eh.typeid.for(ptr nonnull @invalid_argument) +; CHECK-NEXT: [[MATCHES:%.*]] = icmp eq i32 [[TMP1]], [[TMP2]] +; CHECK-NEXT: br i1 [[MATCHES]], label [[CATCH:%.*]], label [[EH_RESUME:%.*]] +; CHECK: catch: +; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { ptr, i32 } [[TMP0]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = tail call ptr @__cxa_begin_catch(ptr [[TMP3]]) +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_restore(ptr [[TPIDR2_INVOKE_OBJ]]) +; CHECK-NEXT: [[LIVE1:%.*]] = call i64 @llvm.aarch64.sme.get.live.za.slices() +; CHECK-NEXT: [[LIVE_TRUNC2:%.*]] = trunc i64 [[LIVE1]] to i16 +; CHECK-NEXT: [[TPIDR2_OBJ_LIVE3:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 1 +; CHECK-NEXT: store i16 [[LIVE_TRUNC2]], ptr [[TPIDR2_OBJ_LIVE3]], align 2 +; CHECK-NEXT: [[TPI_INT4:%.*]] = ptrtoint ptr [[TPIDR2_INVOKE_OBJ]] to i64 +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 [[TPI_INT4]]) +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_save() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: invoke void @bar_shared() +; CHECK-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[LPAD2:%.*]] +; CHECK: invoke.cont: +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: tail call void @__cxa_end_catch() +; CHECK-NEXT: br label [[TRY_CONT]] +; CHECK: try.cont: +; CHECK-NEXT: call void 
@llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: tail call void @fizz_shared() +; CHECK-NEXT: ret void +; CHECK: lpad2: +; CHECK-NEXT: [[TMP5:%.*]] = landingpad { ptr, i32 } +; CHECK-NEXT: cleanup +; CHECK-NEXT: [[LIVE5:%.*]] = call i64 @llvm.aarch64.sme.get.live.za.slices() +; CHECK-NEXT: [[LIVE_TRUNC6:%.*]] = trunc i64 [[LIVE5]] to i16 +; CHECK-NEXT: [[TPIDR2_OBJ_LIVE7:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 1 +; CHECK-NEXT: store i16 [[LIVE_TRUNC6]], ptr [[TPIDR2_OBJ_LIVE7]], align 2 +; CHECK-NEXT: [[TPI_INT8:%.*]] = ptrtoint ptr [[TPIDR2_INVOKE_OBJ]] to i64 +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 [[TPI_INT8]]) +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_save() +; CHECK-NEXT: invoke void @__cxa_end_catch() +; CHECK-NEXT: to label [[CHECK_ZA:%.*]] unwind label [[TERMINATE_LPAD:%.*]] +; CHECK: check.za: +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: [[TPIDR2:%.*]] = call i64 @llvm.aarch64.sme.get.tpidr2() +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[TPIDR2]], 0 +; CHECK-NEXT: br i1 [[CMP]], label [[RESTORE_ZA:%.*]], label [[EH_RESUME]] +; CHECK: restore.za: +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_restore(ptr [[TPIDR2_INVOKE_OBJ]]) +; CHECK-NEXT: br label [[EH_RESUME]] +; CHECK: eh.resume: +; CHECK-NEXT: [[LPAD_VAL6_MERGED:%.*]] = phi { ptr, i32 } [ [[TMP0]], [[LPAD1]] ], [ [[TMP5]], [[CHECK_ZA]] ], [ [[TMP5]], [[RESTORE_ZA]] ] +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: call void @llvm.aarch64.sme.za.disable() +; CHECK-NEXT: resume { ptr, i32 } [[LPAD_VAL6_MERGED]] +; CHECK: terminate.lpad: +; CHECK-NEXT: [[TMP6:%.*]] = landingpad { ptr, i32 } +; CHECK-NEXT: catch ptr null +; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { ptr, i32 } [[TMP6]], 0 +; CHECK-NEXT: tail call void @__clang_call_terminate(ptr [[TMP7]]) +; CHECK-NEXT: unreachable +; + +entry: + invoke void @foo_shared() to label %try.cont unwind label %lpad1 + +lpad1: + %0 = landingpad { ptr, i32 } catch ptr @invalid_argument + %1 = extractvalue { ptr, i32 } %0, 1 + %2 = tail call i32 @llvm.eh.typeid.for(ptr nonnull @invalid_argument) + %matches = icmp eq i32 %1, %2 + br i1 %matches, label %catch, label %eh.resume + +catch: + %3 = extractvalue { ptr, i32 } %0, 0 + %4 = tail call ptr @__cxa_begin_catch(ptr %3) + invoke void @bar_shared() to label %invoke.cont unwind label %lpad2 + +invoke.cont: + tail call void @__cxa_end_catch() + br label %try.cont + +try.cont: + tail call void @fizz_shared() + ret void + +lpad2: + %5 = landingpad { ptr, i32 } cleanup + invoke void @__cxa_end_catch() to label %eh.resume unwind label %terminate.lpad + +eh.resume: + %lpad.val6.merged = phi { ptr, i32 } [ %0, %lpad1 ], [ %5, %lpad2 ] + resume { ptr, i32 } %lpad.val6.merged + +terminate.lpad: + %6 = landingpad { ptr, i32 } catch ptr null + %7 = extractvalue { ptr, i32 } %6, 0 + tail call void @__clang_call_terminate(ptr %7) + unreachable +} + +; +; Try/Catch with two catch clauses. 
Set up & restore lazy-save +; +; int multiple_catch() { +; try { +; foo(); +; } catch (const std::overflow_error& e) { +; bar(); +; } catch (const std::runtime_error& e) { +; fizz(); +; return 23; +; } +; return 42; +; } + +define i32 @multiple_catch() "aarch64_pstate_za_shared" personality i32 0 { +; CHECK-LABEL: @multiple_catch( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TPIDR2_INVOKE_OBJ:%.*]] = alloca { ptr, i16, [6 x i8] }, align 8 +; CHECK-NEXT: [[N:%.*]] = call i64 @llvm.aarch64.sme.cntsb() +; CHECK-NEXT: [[NN:%.*]] = mul i64 [[N]], [[N]] +; CHECK-NEXT: [[BUFFER:%.*]] = alloca i8, i64 [[NN]], align 16 +; CHECK-NEXT: [[TPIDR2_INVOKE_OBJ_BUFFER:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 0 +; CHECK-NEXT: store ptr [[BUFFER]], ptr [[TPIDR2_INVOKE_OBJ_BUFFER]], align 8 +; CHECK-NEXT: [[ZERO:%.*]] = alloca i48, i48 0, align 16 +; CHECK-NEXT: [[RESERVED:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 2 +; CHECK-NEXT: store ptr [[ZERO]], ptr [[RESERVED]], align 8 +; CHECK-NEXT: [[LIVE:%.*]] = call i64 @llvm.aarch64.sme.get.live.za.slices() +; CHECK-NEXT: [[LIVE_TRUNC:%.*]] = trunc i64 [[LIVE]] to i16 +; CHECK-NEXT: [[TPIDR2_OBJ_LIVE:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 1 +; CHECK-NEXT: store i16 [[LIVE_TRUNC]], ptr [[TPIDR2_OBJ_LIVE]], align 2 +; CHECK-NEXT: [[TPI_INT:%.*]] = ptrtoint ptr [[TPIDR2_INVOKE_OBJ]] to i64 +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 [[TPI_INT]]) +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_save() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: invoke void @foo_shared() +; CHECK-NEXT: to label [[RETURN:%.*]] unwind label [[LPAD1:%.*]] +; CHECK: lpad1: +; CHECK-NEXT: [[TMP0:%.*]] = landingpad { ptr, i32 } +; CHECK-NEXT: catch ptr @overflow_error +; CHECK-NEXT: catch ptr @runtime_error +; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { ptr, i32 } [[TMP0]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { ptr, i32 } [[TMP0]], 1 +; CHECK-NEXT: [[TMP3:%.*]] = tail call i32 @llvm.eh.typeid.for(ptr nonnull @overflow_error) +; CHECK-NEXT: [[MATCHES:%.*]] = icmp eq i32 [[TMP2]], [[TMP3]] +; CHECK-NEXT: br i1 [[MATCHES]], label [[CATCH1:%.*]], label [[CATCH_FALLTHROUGH:%.*]] +; CHECK: catch1: +; CHECK-NEXT: [[TMP4:%.*]] = tail call ptr @__cxa_begin_catch(ptr [[TMP1]]) +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_restore(ptr [[TPIDR2_INVOKE_OBJ]]) +; CHECK-NEXT: [[LIVE1:%.*]] = call i64 @llvm.aarch64.sme.get.live.za.slices() +; CHECK-NEXT: [[LIVE_TRUNC2:%.*]] = trunc i64 [[LIVE1]] to i16 +; CHECK-NEXT: [[TPIDR2_OBJ_LIVE3:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 1 +; CHECK-NEXT: store i16 [[LIVE_TRUNC2]], ptr [[TPIDR2_OBJ_LIVE3]], align 2 +; CHECK-NEXT: [[TPI_INT4:%.*]] = ptrtoint ptr [[TPIDR2_INVOKE_OBJ]] to i64 +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 [[TPI_INT4]]) +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_save() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: invoke void @bar_shared() +; CHECK-NEXT: to label [[RETURN_SINK_SPLIT:%.*]] unwind label [[LPAD3:%.*]] +; CHECK: catch.fallthrough: +; CHECK-NEXT: [[TMP5:%.*]] = tail call i32 @llvm.eh.typeid.for(ptr nonnull @runtime_error) +; 
CHECK-NEXT: [[MATCHES1:%.*]] = icmp eq i32 [[TMP2]], [[TMP5]] +; CHECK-NEXT: br i1 [[MATCHES1]], label [[CATCH2:%.*]], label [[EH_RESUME:%.*]] +; CHECK: catch2: +; CHECK-NEXT: [[TMP6:%.*]] = tail call ptr @__cxa_begin_catch(ptr [[TMP1]]) +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_restore(ptr [[TPIDR2_INVOKE_OBJ]]) +; CHECK-NEXT: [[LIVE5:%.*]] = call i64 @llvm.aarch64.sme.get.live.za.slices() +; CHECK-NEXT: [[LIVE_TRUNC6:%.*]] = trunc i64 [[LIVE5]] to i16 +; CHECK-NEXT: [[TPIDR2_OBJ_LIVE7:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 1 +; CHECK-NEXT: store i16 [[LIVE_TRUNC6]], ptr [[TPIDR2_OBJ_LIVE7]], align 2 +; CHECK-NEXT: [[TPI_INT8:%.*]] = ptrtoint ptr [[TPIDR2_INVOKE_OBJ]] to i64 +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 [[TPI_INT8]]) +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_save() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: invoke void @fizz_shared() +; CHECK-NEXT: to label [[RETURN_SINK_SPLIT]] unwind label [[LPAD2:%.*]] +; CHECK: lpad2: +; CHECK-NEXT: [[TMP7:%.*]] = landingpad { ptr, i32 } +; CHECK-NEXT: cleanup +; CHECK-NEXT: [[LIVE9:%.*]] = call i64 @llvm.aarch64.sme.get.live.za.slices() +; CHECK-NEXT: [[LIVE_TRUNC10:%.*]] = trunc i64 [[LIVE9]] to i16 +; CHECK-NEXT: [[TPIDR2_OBJ_LIVE11:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 1 +; CHECK-NEXT: store i16 [[LIVE_TRUNC10]], ptr [[TPIDR2_OBJ_LIVE11]], align 2 +; CHECK-NEXT: [[TPI_INT12:%.*]] = ptrtoint ptr [[TPIDR2_INVOKE_OBJ]] to i64 +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 [[TPI_INT12]]) +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_save() +; CHECK-NEXT: invoke void @__cxa_end_catch() +; CHECK-NEXT: to label [[CHECK_ZA:%.*]] unwind label [[TERMINATE_LPAD:%.*]] +; CHECK: lpad3: +; CHECK-NEXT: [[TMP8:%.*]] = landingpad { ptr, i32 } +; CHECK-NEXT: cleanup +; CHECK-NEXT: [[LIVE15:%.*]] = call i64 @llvm.aarch64.sme.get.live.za.slices() +; CHECK-NEXT: [[LIVE_TRUNC16:%.*]] = trunc i64 [[LIVE15]] to i16 +; CHECK-NEXT: [[TPIDR2_OBJ_LIVE17:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 1 +; CHECK-NEXT: store i16 [[LIVE_TRUNC16]], ptr [[TPIDR2_OBJ_LIVE17]], align 2 +; CHECK-NEXT: [[TPI_INT18:%.*]] = ptrtoint ptr [[TPIDR2_INVOKE_OBJ]] to i64 +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 [[TPI_INT18]]) +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_save() +; CHECK-NEXT: invoke void @__cxa_end_catch() +; CHECK-NEXT: to label [[CHECK_ZA13:%.*]] unwind label [[TERMINATE_LPAD]] +; CHECK: return.sink.split: +; CHECK-NEXT: [[RETVAL_PH:%.*]] = phi i32 [ 42, [[CATCH1]] ], [ 23, [[CATCH2]] ] +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: tail call void @__cxa_end_catch() +; CHECK-NEXT: br label [[RETURN]] +; CHECK: return: +; CHECK-NEXT: [[RETVAL:%.*]] = phi i32 [ 42, [[ENTRY:%.*]] ], [ [[RETVAL_PH]], [[RETURN_SINK_SPLIT]] ] +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: ret i32 [[RETVAL]] +; CHECK: check.za: +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: [[TPIDR2:%.*]] = call i64 @llvm.aarch64.sme.get.tpidr2() +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 
[[TPIDR2]], 0 +; CHECK-NEXT: br i1 [[CMP]], label [[RESTORE_ZA:%.*]], label [[EH_RESUME]] +; CHECK: restore.za: +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_restore(ptr [[TPIDR2_INVOKE_OBJ]]) +; CHECK-NEXT: br label [[EH_RESUME]] +; CHECK: check.za13: +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: [[TPIDR219:%.*]] = call i64 @llvm.aarch64.sme.get.tpidr2() +; CHECK-NEXT: [[CMP20:%.*]] = icmp eq i64 [[TPIDR219]], 0 +; CHECK-NEXT: br i1 [[CMP20]], label [[RESTORE_ZA14:%.*]], label [[EH_RESUME]] +; CHECK: restore.za14: +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_restore(ptr [[TPIDR2_INVOKE_OBJ]]) +; CHECK-NEXT: br label [[EH_RESUME]] +; CHECK: eh.resume: +; CHECK-NEXT: [[LPAD_VAL13_MERGED:%.*]] = phi { ptr, i32 } [ [[TMP0]], [[CATCH_FALLTHROUGH]] ], [ [[TMP7]], [[CHECK_ZA]] ], [ [[TMP8]], [[CHECK_ZA13]] ], [ [[TMP7]], [[RESTORE_ZA]] ], [ [[TMP8]], [[RESTORE_ZA14]] ] +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: call void @llvm.aarch64.sme.za.disable() +; CHECK-NEXT: resume { ptr, i32 } [[LPAD_VAL13_MERGED]] +; CHECK: terminate.lpad: +; CHECK-NEXT: [[TMP9:%.*]] = landingpad { ptr, i32 } +; CHECK-NEXT: catch ptr null +; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { ptr, i32 } [[TMP9]], 0 +; CHECK-NEXT: tail call void @__clang_call_terminate(ptr [[TMP10]]) +; CHECK-NEXT: unreachable +; +entry: + invoke void @foo_shared() + to label %return unwind label %lpad1 + +lpad1: + %0 = landingpad { ptr, i32 } + catch ptr @overflow_error + catch ptr @runtime_error + %1 = extractvalue { ptr, i32 } %0, 0 + %2 = extractvalue { ptr, i32 } %0, 1 + %3 = tail call i32 @llvm.eh.typeid.for(ptr nonnull @overflow_error) + %matches = icmp eq i32 %2, %3 + br i1 %matches, label %catch1, label %catch.fallthrough + +catch1: + %4 = tail call ptr @__cxa_begin_catch(ptr %1) + invoke void @bar_shared() + to label %return.sink.split unwind label %lpad3 + +catch.fallthrough: + %5 = tail call i32 @llvm.eh.typeid.for(ptr nonnull @runtime_error) + %matches1 = icmp eq i32 %2, %5 + br i1 %matches1, label %catch2, label %eh.resume + +catch2: + %6 = tail call ptr @__cxa_begin_catch(ptr %1) + invoke void @fizz_shared() + to label %return.sink.split unwind label %lpad2 + +lpad2: + %7 = landingpad { ptr, i32 } + cleanup + invoke void @__cxa_end_catch() + to label %eh.resume unwind label %terminate.lpad + +lpad3: + %8 = landingpad { ptr, i32 } + cleanup + invoke void @__cxa_end_catch() + to label %eh.resume unwind label %terminate.lpad + +return.sink.split: + %retval.ph = phi i32 [ 42, %catch1 ], [ 23, %catch2 ] + tail call void @__cxa_end_catch() + br label %return + +return: + %retval = phi i32 [ 42, %entry ], [ %retval.ph, %return.sink.split ] + ret i32 %retval + +eh.resume: + %lpad.val13.merged = phi { ptr, i32 } [ %0, %catch.fallthrough ], [ %7, %lpad2 ], [ %8, %lpad3 ] + resume { ptr, i32 } %lpad.val13.merged + +terminate.lpad: + %9 = landingpad { ptr, i32 } + catch ptr null + %10 = extractvalue { ptr, i32 } %9, 0 + tail call void @__clang_call_terminate(ptr %10) + unreachable +} + +; +; Try/Catch with throw in try block. Set up & restore lazy-save +; +; void try_throw() { +; try { +; throw 23; +; } catch (...) 
{ +; foo(); +; } +; bar(); +; return; +; } + +define void @try_throw() "aarch64_pstate_za_shared" personality i32 1 { +; CHECK-LABEL: @try_throw( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TPIDR2_INVOKE_OBJ:%.*]] = alloca { ptr, i16, [6 x i8] }, align 8 +; CHECK-NEXT: [[N:%.*]] = call i64 @llvm.aarch64.sme.cntsb() +; CHECK-NEXT: [[NN:%.*]] = mul i64 [[N]], [[N]] +; CHECK-NEXT: [[BUFFER:%.*]] = alloca i8, i64 [[NN]], align 16 +; CHECK-NEXT: [[TPIDR2_INVOKE_OBJ_BUFFER:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 0 +; CHECK-NEXT: store ptr [[BUFFER]], ptr [[TPIDR2_INVOKE_OBJ_BUFFER]], align 8 +; CHECK-NEXT: [[ZERO:%.*]] = alloca i48, i48 0, align 16 +; CHECK-NEXT: [[RESERVED:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 2 +; CHECK-NEXT: store ptr [[ZERO]], ptr [[RESERVED]], align 8 +; CHECK-NEXT: [[EXCEPTION:%.*]] = tail call ptr @__cxa_allocate_exception(i64 4) +; CHECK-NEXT: store i32 23, ptr [[EXCEPTION]], align 4 +; CHECK-NEXT: [[LIVE:%.*]] = call i64 @llvm.aarch64.sme.get.live.za.slices() +; CHECK-NEXT: [[LIVE_TRUNC:%.*]] = trunc i64 [[LIVE]] to i16 +; CHECK-NEXT: [[TPIDR2_OBJ_LIVE:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 1 +; CHECK-NEXT: store i16 [[LIVE_TRUNC]], ptr [[TPIDR2_OBJ_LIVE]], align 2 +; CHECK-NEXT: [[TPI_INT:%.*]] = ptrtoint ptr [[TPIDR2_INVOKE_OBJ]] to i64 +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 [[TPI_INT]]) +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_save() +; CHECK-NEXT: invoke void @__cxa_throw(ptr nonnull [[EXCEPTION]], ptr nonnull @except, ptr null) +; CHECK-NEXT: to label [[CHECK_ZA:%.*]] unwind label [[LPAD:%.*]] +; CHECK: lpad: +; CHECK-NEXT: [[TMP0:%.*]] = landingpad { ptr, i32 } +; CHECK-NEXT: catch ptr null +; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { ptr, i32 } [[TMP0]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = tail call ptr @__cxa_begin_catch(ptr [[TMP1]]) +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_restore(ptr [[TPIDR2_INVOKE_OBJ]]) +; CHECK-NEXT: [[LIVE1:%.*]] = call i64 @llvm.aarch64.sme.get.live.za.slices() +; CHECK-NEXT: [[LIVE_TRUNC2:%.*]] = trunc i64 [[LIVE1]] to i16 +; CHECK-NEXT: [[TPIDR2_OBJ_LIVE3:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 1 +; CHECK-NEXT: store i16 [[LIVE_TRUNC2]], ptr [[TPIDR2_OBJ_LIVE3]], align 2 +; CHECK-NEXT: [[TPI_INT4:%.*]] = ptrtoint ptr [[TPIDR2_INVOKE_OBJ]] to i64 +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 [[TPI_INT4]]) +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_save() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: invoke void @foo_shared() +; CHECK-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[LPAD1:%.*]] +; CHECK: invoke.cont: +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: tail call void @__cxa_end_catch() +; CHECK-NEXT: tail call void @bar_shared() +; CHECK-NEXT: ret void +; CHECK: lpad1: +; CHECK-NEXT: [[TMP3:%.*]] = landingpad { ptr, i32 } +; CHECK-NEXT: cleanup +; CHECK-NEXT: [[LIVE7:%.*]] = call i64 @llvm.aarch64.sme.get.live.za.slices() +; CHECK-NEXT: [[LIVE_TRUNC8:%.*]] = trunc i64 [[LIVE7]] to i16 +; CHECK-NEXT: [[TPIDR2_OBJ_LIVE9:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 1 +; CHECK-NEXT: 
store i16 [[LIVE_TRUNC8]], ptr [[TPIDR2_OBJ_LIVE9]], align 2 +; CHECK-NEXT: [[TPI_INT10:%.*]] = ptrtoint ptr [[TPIDR2_INVOKE_OBJ]] to i64 +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 [[TPI_INT10]]) +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_save() +; CHECK-NEXT: invoke void @__cxa_end_catch() +; CHECK-NEXT: to label [[CHECK_ZA5:%.*]] unwind label [[TERMINATE_LPAD:%.*]] +; CHECK: check.za5: +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: [[TPIDR211:%.*]] = call i64 @llvm.aarch64.sme.get.tpidr2() +; CHECK-NEXT: [[CMP12:%.*]] = icmp eq i64 [[TPIDR211]], 0 +; CHECK-NEXT: br i1 [[CMP12]], label [[RESTORE_ZA6:%.*]], label [[EH_RESUME:%.*]] +; CHECK: restore.za6: +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_restore(ptr [[TPIDR2_INVOKE_OBJ]]) +; CHECK-NEXT: br label [[EH_RESUME]] +; CHECK: eh.resume: +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: call void @llvm.aarch64.sme.za.disable() +; CHECK-NEXT: resume { ptr, i32 } [[TMP3]] +; CHECK: terminate.lpad: +; CHECK-NEXT: [[TMP4:%.*]] = landingpad { ptr, i32 } +; CHECK-NEXT: catch ptr null +; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { ptr, i32 } [[TMP4]], 0 +; CHECK-NEXT: tail call void @__clang_call_terminate(ptr [[TMP5]]) +; CHECK-NEXT: unreachable +; CHECK: check.za: +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: [[TPIDR2:%.*]] = call i64 @llvm.aarch64.sme.get.tpidr2() +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[TPIDR2]], 0 +; CHECK-NEXT: br i1 [[CMP]], label [[RESTORE_ZA:%.*]], label [[UNREACHABLE:%.*]] +; CHECK: restore.za: +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_restore(ptr [[TPIDR2_INVOKE_OBJ]]) +; CHECK-NEXT: br label [[UNREACHABLE]] +; CHECK: unreachable: +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: unreachable +; +entry: + %exception = tail call ptr @__cxa_allocate_exception(i64 4) + store i32 23, ptr %exception + invoke void @__cxa_throw(ptr nonnull %exception, ptr nonnull @except, ptr null) + to label %unreachable unwind label %lpad + +lpad: + %0 = landingpad { ptr, i32 } + catch ptr null + %1 = extractvalue { ptr, i32 } %0, 0 + %2 = tail call ptr @__cxa_begin_catch(ptr %1) + invoke void @foo_shared() + to label %invoke.cont unwind label %lpad1 + +invoke.cont: + tail call void @__cxa_end_catch() + tail call void @bar_shared() + ret void + +lpad1: + %3 = landingpad { ptr, i32 } + cleanup + invoke void @__cxa_end_catch() + to label %eh.resume unwind label %terminate.lpad + +eh.resume: + resume { ptr, i32 } %3 + +terminate.lpad: + %4 = landingpad { ptr, i32 } + catch ptr null + %5 = extractvalue { ptr, i32 } %4, 0 + tail call void @__clang_call_terminate(ptr %5) + unreachable + +unreachable: + unreachable +} + +; +; Try/Catch with throw in catch block. Set up & restore lazy-save +; +; void throw_except() { +; try { +; foo(); +; } catch (...) 
{ +; throw 23; +; } +; bar(); +; return; +; } + +define void @catch_throw() "aarch64_pstate_za_shared" personality i32 0 { +; CHECK-LABEL: @catch_throw( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TPIDR2_INVOKE_OBJ:%.*]] = alloca { ptr, i16, [6 x i8] }, align 8 +; CHECK-NEXT: [[N:%.*]] = call i64 @llvm.aarch64.sme.cntsb() +; CHECK-NEXT: [[NN:%.*]] = mul i64 [[N]], [[N]] +; CHECK-NEXT: [[BUFFER:%.*]] = alloca i8, i64 [[NN]], align 16 +; CHECK-NEXT: [[TPIDR2_INVOKE_OBJ_BUFFER:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 0 +; CHECK-NEXT: store ptr [[BUFFER]], ptr [[TPIDR2_INVOKE_OBJ_BUFFER]], align 8 +; CHECK-NEXT: [[ZERO:%.*]] = alloca i48, i48 0, align 16 +; CHECK-NEXT: [[RESERVED:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 2 +; CHECK-NEXT: store ptr [[ZERO]], ptr [[RESERVED]], align 8 +; CHECK-NEXT: [[LIVE:%.*]] = call i64 @llvm.aarch64.sme.get.live.za.slices() +; CHECK-NEXT: [[LIVE_TRUNC:%.*]] = trunc i64 [[LIVE]] to i16 +; CHECK-NEXT: [[TPIDR2_OBJ_LIVE:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 1 +; CHECK-NEXT: store i16 [[LIVE_TRUNC]], ptr [[TPIDR2_OBJ_LIVE]], align 2 +; CHECK-NEXT: [[TPI_INT:%.*]] = ptrtoint ptr [[TPIDR2_INVOKE_OBJ]] to i64 +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 [[TPI_INT]]) +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_save() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: invoke void @foo_shared() +; CHECK-NEXT: to label [[TRY_CONT:%.*]] unwind label [[LPAD1:%.*]] +; CHECK: lpad1: +; CHECK-NEXT: [[TMP0:%.*]] = landingpad { ptr, i32 } +; CHECK-NEXT: catch ptr null +; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { ptr, i32 } [[TMP0]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = tail call ptr @__cxa_begin_catch(ptr [[TMP1]]) +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_restore(ptr [[TPIDR2_INVOKE_OBJ]]) +; CHECK-NEXT: [[EXCEPTION:%.*]] = tail call ptr @__cxa_allocate_exception(i64 4) +; CHECK-NEXT: store i32 23, ptr [[EXCEPTION]], align 4 +; CHECK-NEXT: [[LIVE1:%.*]] = call i64 @llvm.aarch64.sme.get.live.za.slices() +; CHECK-NEXT: [[LIVE_TRUNC2:%.*]] = trunc i64 [[LIVE1]] to i16 +; CHECK-NEXT: [[TPIDR2_OBJ_LIVE3:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 1 +; CHECK-NEXT: store i16 [[LIVE_TRUNC2]], ptr [[TPIDR2_OBJ_LIVE3]], align 2 +; CHECK-NEXT: [[TPI_INT4:%.*]] = ptrtoint ptr [[TPIDR2_INVOKE_OBJ]] to i64 +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 [[TPI_INT4]]) +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_save() +; CHECK-NEXT: invoke void @__cxa_throw(ptr nonnull [[EXCEPTION]], ptr nonnull @except, ptr null) +; CHECK-NEXT: to label [[CHECK_ZA:%.*]] unwind label [[LPAD2:%.*]] +; CHECK: lpad2: +; CHECK-NEXT: [[TMP3:%.*]] = landingpad { ptr, i32 } +; CHECK-NEXT: cleanup +; CHECK-NEXT: [[LIVE7:%.*]] = call i64 @llvm.aarch64.sme.get.live.za.slices() +; CHECK-NEXT: [[LIVE_TRUNC8:%.*]] = trunc i64 [[LIVE7]] to i16 +; CHECK-NEXT: [[TPIDR2_OBJ_LIVE9:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 1 +; CHECK-NEXT: store i16 [[LIVE_TRUNC8]], ptr [[TPIDR2_OBJ_LIVE9]], align 2 +; CHECK-NEXT: [[TPI_INT10:%.*]] = ptrtoint ptr [[TPIDR2_INVOKE_OBJ]] to i64 +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; 
CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 [[TPI_INT10]]) +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_save() +; CHECK-NEXT: invoke void @__cxa_end_catch() +; CHECK-NEXT: to label [[CHECK_ZA5:%.*]] unwind label [[TERMINATE_LPAD:%.*]] +; CHECK: try.cont: +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: tail call void @bar_shared() +; CHECK-NEXT: ret void +; CHECK: check.za5: +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: [[TPIDR211:%.*]] = call i64 @llvm.aarch64.sme.get.tpidr2() +; CHECK-NEXT: [[CMP12:%.*]] = icmp eq i64 [[TPIDR211]], 0 +; CHECK-NEXT: br i1 [[CMP12]], label [[RESTORE_ZA6:%.*]], label [[EH_RESUME:%.*]] +; CHECK: restore.za6: +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_restore(ptr [[TPIDR2_INVOKE_OBJ]]) +; CHECK-NEXT: br label [[EH_RESUME]] +; CHECK: eh.resume: +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: call void @llvm.aarch64.sme.za.disable() +; CHECK-NEXT: resume { ptr, i32 } [[TMP3]] +; CHECK: terminate.lpad: +; CHECK-NEXT: [[TMP4:%.*]] = landingpad { ptr, i32 } +; CHECK-NEXT: catch ptr null +; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { ptr, i32 } [[TMP4]], 0 +; CHECK-NEXT: tail call void @__clang_call_terminate(ptr [[TMP5]]) +; CHECK-NEXT: unreachable +; CHECK: check.za: +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: [[TPIDR2:%.*]] = call i64 @llvm.aarch64.sme.get.tpidr2() +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[TPIDR2]], 0 +; CHECK-NEXT: br i1 [[CMP]], label [[RESTORE_ZA:%.*]], label [[UNREACHABLE:%.*]] +; CHECK: restore.za: +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_restore(ptr [[TPIDR2_INVOKE_OBJ]]) +; CHECK-NEXT: br label [[UNREACHABLE]] +; CHECK: unreachable: +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: unreachable +; +entry: + invoke void @foo_shared() + to label %try.cont unwind label %lpad1 + +lpad1: + %0 = landingpad { ptr, i32 } + catch ptr null + %1 = extractvalue { ptr, i32 } %0, 0 + %2 = tail call ptr @__cxa_begin_catch(ptr %1) + %exception = tail call ptr @__cxa_allocate_exception(i64 4) + store i32 23, ptr %exception + invoke void @__cxa_throw(ptr nonnull %exception, ptr nonnull @except, ptr null) + to label %unreachable unwind label %lpad2 + +lpad2: + %3 = landingpad { ptr, i32 } + cleanup + invoke void @__cxa_end_catch() + to label %eh.resume unwind label %terminate.lpad + +try.cont: + tail call void @bar_shared() + ret void + +eh.resume: + resume { ptr, i32 } %3 + +terminate.lpad: + %4 = landingpad { ptr, i32 } + catch ptr null + %5 = extractvalue { ptr, i32 } %4, 0 + tail call void @__clang_call_terminate(ptr %5) + unreachable + +unreachable: + unreachable +} + +; +; Nested Try/Catch. Set up & restore lazy-save +; +; void nested() { +; try { +; foo(); +; try { +; bar(); +; } catch (const std::runtime_error& e) { +; fizz(); +; } +; } +; catch (...) 
{ +; buzz(); +; } +; } + +define void @nested() "aarch64_pstate_za_shared" personality i32 0 { +; CHECK-LABEL: @nested( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TPIDR2_INVOKE_OBJ:%.*]] = alloca { ptr, i16, [6 x i8] }, align 8 +; CHECK-NEXT: [[N:%.*]] = call i64 @llvm.aarch64.sme.cntsb() +; CHECK-NEXT: [[NN:%.*]] = mul i64 [[N]], [[N]] +; CHECK-NEXT: [[BUFFER:%.*]] = alloca i8, i64 [[NN]], align 16 +; CHECK-NEXT: [[TPIDR2_INVOKE_OBJ_BUFFER:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 0 +; CHECK-NEXT: store ptr [[BUFFER]], ptr [[TPIDR2_INVOKE_OBJ_BUFFER]], align 8 +; CHECK-NEXT: [[ZERO:%.*]] = alloca i48, i48 0, align 16 +; CHECK-NEXT: [[RESERVED:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 2 +; CHECK-NEXT: store ptr [[ZERO]], ptr [[RESERVED]], align 8 +; CHECK-NEXT: [[LIVE:%.*]] = call i64 @llvm.aarch64.sme.get.live.za.slices() +; CHECK-NEXT: [[LIVE_TRUNC:%.*]] = trunc i64 [[LIVE]] to i16 +; CHECK-NEXT: [[TPIDR2_OBJ_LIVE:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 1 +; CHECK-NEXT: store i16 [[LIVE_TRUNC]], ptr [[TPIDR2_OBJ_LIVE]], align 2 +; CHECK-NEXT: [[TPI_INT:%.*]] = ptrtoint ptr [[TPIDR2_INVOKE_OBJ]] to i64 +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 [[TPI_INT]]) +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_save() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: invoke void @foo_shared() +; CHECK-NEXT: to label [[INVOKE_CONT1:%.*]] unwind label [[LPAD1:%.*]] +; CHECK: invoke.cont1: +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: [[LIVE1:%.*]] = call i64 @llvm.aarch64.sme.get.live.za.slices() +; CHECK-NEXT: [[LIVE_TRUNC2:%.*]] = trunc i64 [[LIVE1]] to i16 +; CHECK-NEXT: [[TPIDR2_OBJ_LIVE3:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 1 +; CHECK-NEXT: store i16 [[LIVE_TRUNC2]], ptr [[TPIDR2_OBJ_LIVE3]], align 2 +; CHECK-NEXT: [[TPI_INT4:%.*]] = ptrtoint ptr [[TPIDR2_INVOKE_OBJ]] to i64 +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 [[TPI_INT4]]) +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_save() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: invoke void @bar_shared() +; CHECK-NEXT: to label [[TRY_CONT14:%.*]] unwind label [[LPAD2:%.*]] +; CHECK: lpad1: +; CHECK-NEXT: [[TMP0:%.*]] = landingpad { ptr, i32 } +; CHECK-NEXT: catch ptr null +; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { ptr, i32 } [[TMP0]], 0 +; CHECK-NEXT: br label [[CATCH2:%.*]] +; CHECK: lpad2: +; CHECK-NEXT: [[TMP2:%.*]] = landingpad { ptr, i32 } +; CHECK-NEXT: catch ptr @runtime_error +; CHECK-NEXT: catch ptr null +; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { ptr, i32 } [[TMP2]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { ptr, i32 } [[TMP2]], 1 +; CHECK-NEXT: [[TMP5:%.*]] = tail call i32 @llvm.eh.typeid.for(ptr nonnull @runtime_error) +; CHECK-NEXT: [[MATCHES:%.*]] = icmp eq i32 [[TMP4]], [[TMP5]] +; CHECK-NEXT: br i1 [[MATCHES]], label [[CATCH1:%.*]], label [[CATCH2]] +; CHECK: catch1: +; CHECK-NEXT: [[TMP6:%.*]] = tail call ptr @__cxa_begin_catch(ptr [[TMP3]]) +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_restore(ptr [[TPIDR2_INVOKE_OBJ]]) +; CHECK-NEXT: [[LIVE5:%.*]] = call i64 @llvm.aarch64.sme.get.live.za.slices() +; 
CHECK-NEXT: [[LIVE_TRUNC6:%.*]] = trunc i64 [[LIVE5]] to i16 +; CHECK-NEXT: [[TPIDR2_OBJ_LIVE7:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 1 +; CHECK-NEXT: store i16 [[LIVE_TRUNC6]], ptr [[TPIDR2_OBJ_LIVE7]], align 2 +; CHECK-NEXT: [[TPI_INT8:%.*]] = ptrtoint ptr [[TPIDR2_INVOKE_OBJ]] to i64 +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 [[TPI_INT8]]) +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_save() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: invoke void @fizz_shared() +; CHECK-NEXT: to label [[INVOKE_CONT2:%.*]] unwind label [[LPAD3:%.*]] +; CHECK: invoke.cont2: +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: [[LIVE9:%.*]] = call i64 @llvm.aarch64.sme.get.live.za.slices() +; CHECK-NEXT: [[LIVE_TRUNC10:%.*]] = trunc i64 [[LIVE9]] to i16 +; CHECK-NEXT: [[TPIDR2_OBJ_LIVE11:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 1 +; CHECK-NEXT: store i16 [[LIVE_TRUNC10]], ptr [[TPIDR2_OBJ_LIVE11]], align 2 +; CHECK-NEXT: [[TPI_INT12:%.*]] = ptrtoint ptr [[TPIDR2_INVOKE_OBJ]] to i64 +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 [[TPI_INT12]]) +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_save() +; CHECK-NEXT: invoke void @__cxa_end_catch() +; CHECK-NEXT: to label [[CHECK_ZA:%.*]] unwind label [[LPAD4:%.*]] +; CHECK: lpad3: +; CHECK-NEXT: [[TMP7:%.*]] = landingpad { ptr, i32 } +; CHECK-NEXT: catch ptr null +; CHECK-NEXT: [[LIVE15:%.*]] = call i64 @llvm.aarch64.sme.get.live.za.slices() +; CHECK-NEXT: [[LIVE_TRUNC16:%.*]] = trunc i64 [[LIVE15]] to i16 +; CHECK-NEXT: [[TPIDR2_OBJ_LIVE17:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 1 +; CHECK-NEXT: store i16 [[LIVE_TRUNC16]], ptr [[TPIDR2_OBJ_LIVE17]], align 2 +; CHECK-NEXT: [[TPI_INT18:%.*]] = ptrtoint ptr [[TPIDR2_INVOKE_OBJ]] to i64 +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 [[TPI_INT18]]) +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_save() +; CHECK-NEXT: invoke void @__cxa_end_catch() +; CHECK-NEXT: to label [[CHECK_ZA13:%.*]] unwind label [[TERMINATE_LPAD:%.*]] +; CHECK: lpad4: +; CHECK-NEXT: [[TMP8:%.*]] = landingpad { ptr, i32 } +; CHECK-NEXT: catch ptr null +; CHECK-NEXT: br label [[EH_CLEANUP:%.*]] +; CHECK: check.za13: +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: [[TPIDR219:%.*]] = call i64 @llvm.aarch64.sme.get.tpidr2() +; CHECK-NEXT: [[CMP20:%.*]] = icmp eq i64 [[TPIDR219]], 0 +; CHECK-NEXT: br i1 [[CMP20]], label [[RESTORE_ZA14:%.*]], label [[EH_CLEANUP]] +; CHECK: restore.za14: +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_restore(ptr [[TPIDR2_INVOKE_OBJ]]) +; CHECK-NEXT: br label [[EH_CLEANUP]] +; CHECK: eh.cleanup: +; CHECK-NEXT: [[PN:%.*]] = phi { ptr, i32 } [ [[TMP8]], [[LPAD4]] ], [ [[TMP7]], [[CHECK_ZA13]] ], [ [[TMP7]], [[RESTORE_ZA14]] ] +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: [[EXN_SLOT_0:%.*]] = extractvalue { ptr, i32 } [[PN]], 0 +; CHECK-NEXT: br label [[CATCH2]] +; CHECK: catch2: +; CHECK-NEXT: [[EXN_SLOT_1:%.*]] = phi ptr [ [[EXN_SLOT_0]], [[EH_CLEANUP]] ], [ [[TMP3]], [[LPAD2]] ], [ [[TMP1]], [[LPAD1]] ] +; CHECK-NEXT: [[TMP9:%.*]] = tail call ptr @__cxa_begin_catch(ptr [[EXN_SLOT_1]]) +; 
CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_restore(ptr [[TPIDR2_INVOKE_OBJ]]) +; CHECK-NEXT: [[LIVE21:%.*]] = call i64 @llvm.aarch64.sme.get.live.za.slices() +; CHECK-NEXT: [[LIVE_TRUNC22:%.*]] = trunc i64 [[LIVE21]] to i16 +; CHECK-NEXT: [[TPIDR2_OBJ_LIVE23:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 1 +; CHECK-NEXT: store i16 [[LIVE_TRUNC22]], ptr [[TPIDR2_OBJ_LIVE23]], align 2 +; CHECK-NEXT: [[TPI_INT24:%.*]] = ptrtoint ptr [[TPIDR2_INVOKE_OBJ]] to i64 +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 [[TPI_INT24]]) +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_save() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: invoke void @buzz_shared() +; CHECK-NEXT: to label [[INVOKE_CONT3:%.*]] unwind label [[LPAD5:%.*]] +; CHECK: invoke.cont3: +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: tail call void @__cxa_end_catch() +; CHECK-NEXT: br label [[TRY_CONT14]] +; CHECK: check.za: +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: [[TPIDR2:%.*]] = call i64 @llvm.aarch64.sme.get.tpidr2() +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[TPIDR2]], 0 +; CHECK-NEXT: br i1 [[CMP]], label [[RESTORE_ZA:%.*]], label [[TRY_CONT14]] +; CHECK: restore.za: +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_restore(ptr [[TPIDR2_INVOKE_OBJ]]) +; CHECK-NEXT: br label [[TRY_CONT14]] +; CHECK: try.cont14: +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: ret void +; CHECK: lpad5: +; CHECK-NEXT: [[TMP10:%.*]] = landingpad { ptr, i32 } +; CHECK-NEXT: cleanup +; CHECK-NEXT: [[LIVE27:%.*]] = call i64 @llvm.aarch64.sme.get.live.za.slices() +; CHECK-NEXT: [[LIVE_TRUNC28:%.*]] = trunc i64 [[LIVE27]] to i16 +; CHECK-NEXT: [[TPIDR2_OBJ_LIVE29:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 1 +; CHECK-NEXT: store i16 [[LIVE_TRUNC28]], ptr [[TPIDR2_OBJ_LIVE29]], align 2 +; CHECK-NEXT: [[TPI_INT30:%.*]] = ptrtoint ptr [[TPIDR2_INVOKE_OBJ]] to i64 +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 [[TPI_INT30]]) +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_save() +; CHECK-NEXT: invoke void @__cxa_end_catch() +; CHECK-NEXT: to label [[CHECK_ZA25:%.*]] unwind label [[TERMINATE_LPAD]] +; CHECK: check.za25: +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: [[TPIDR231:%.*]] = call i64 @llvm.aarch64.sme.get.tpidr2() +; CHECK-NEXT: [[CMP32:%.*]] = icmp eq i64 [[TPIDR231]], 0 +; CHECK-NEXT: br i1 [[CMP32]], label [[RESTORE_ZA26:%.*]], label [[EH_RESUME:%.*]] +; CHECK: restore.za26: +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_restore(ptr [[TPIDR2_INVOKE_OBJ]]) +; CHECK-NEXT: br label [[EH_RESUME]] +; CHECK: eh.resume: +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: call void @llvm.aarch64.sme.za.disable() +; CHECK-NEXT: resume { ptr, i32 } [[TMP10]] +; CHECK: terminate.lpad: +; CHECK-NEXT: [[TMP11:%.*]] = landingpad { ptr, i32 } +; CHECK-NEXT: catch ptr null +; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { ptr, i32 } [[TMP11]], 0 +; CHECK-NEXT: tail call void @__clang_call_terminate(ptr [[TMP12]]) +; CHECK-NEXT: unreachable +; +entry: + invoke void @foo_shared() + to label %invoke.cont1 unwind label %lpad1 + 
+invoke.cont1: + invoke void @bar_shared() + to label %try.cont14 unwind label %lpad2 + +lpad1: + %0 = landingpad { ptr, i32 } + catch ptr null + %1 = extractvalue { ptr, i32 } %0, 0 + br label %catch2 + +lpad2: + %2 = landingpad { ptr, i32 } + catch ptr @runtime_error + catch ptr null + %3 = extractvalue { ptr, i32 } %2, 0 + %4 = extractvalue { ptr, i32 } %2, 1 + %5 = tail call i32 @llvm.eh.typeid.for(ptr nonnull @runtime_error) + %matches = icmp eq i32 %4, %5 + br i1 %matches, label %catch1, label %catch2 + +catch1: + %6 = tail call ptr @__cxa_begin_catch(ptr %3) + invoke void @fizz_shared() + to label %invoke.cont2 unwind label %lpad3 + +invoke.cont2: + invoke void @__cxa_end_catch() + to label %try.cont14 unwind label %lpad4 + +lpad3: + %7 = landingpad { ptr, i32 } + catch ptr null + invoke void @__cxa_end_catch() + to label %eh.cleanup unwind label %terminate.lpad + +lpad4: + %8 = landingpad { ptr, i32 } + catch ptr null + br label %eh.cleanup + +eh.cleanup: + %pn = phi { ptr, i32 } [ %8, %lpad4 ], [ %7, %lpad3 ] + %exn.slot.0 = extractvalue { ptr, i32 } %pn, 0 + br label %catch2 + +catch2: + %exn.slot.1 = phi ptr [ %exn.slot.0, %eh.cleanup ], [ %3, %lpad2 ], [ %1, %lpad1 ] + %9 = tail call ptr @__cxa_begin_catch(ptr %exn.slot.1) + invoke void @buzz_shared() + to label %invoke.cont3 unwind label %lpad5 + +invoke.cont3: + tail call void @__cxa_end_catch() + br label %try.cont14 + +try.cont14: + ret void + +lpad5: + %10 = landingpad { ptr, i32 } + cleanup + invoke void @__cxa_end_catch() + to label %eh.resume unwind label %terminate.lpad + +eh.resume: + resume { ptr, i32 } %10 + +terminate.lpad: + %11 = landingpad { ptr, i32 } + catch ptr null + %12 = extractvalue { ptr, i32 } %11, 0 + tail call void @__clang_call_terminate(ptr %12) + unreachable +} + +; void try_condition(bool cond) { +; try { +; if (cond) { +; throw 23; +; } +; bar(); +; } +; catch (...) 
{ +; fizz(); +; } +; } + +define void @conditional_throw(i1 %cond) "aarch64_pstate_za_shared" personality i32 0 { +; CHECK-LABEL: @conditional_throw( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TPIDR2_INVOKE_OBJ:%.*]] = alloca { ptr, i16, [6 x i8] }, align 8 +; CHECK-NEXT: [[N:%.*]] = call i64 @llvm.aarch64.sme.cntsb() +; CHECK-NEXT: [[NN:%.*]] = mul i64 [[N]], [[N]] +; CHECK-NEXT: [[BUFFER:%.*]] = alloca i8, i64 [[NN]], align 16 +; CHECK-NEXT: [[TPIDR2_INVOKE_OBJ_BUFFER:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 0 +; CHECK-NEXT: store ptr [[BUFFER]], ptr [[TPIDR2_INVOKE_OBJ_BUFFER]], align 8 +; CHECK-NEXT: [[ZERO:%.*]] = alloca i48, i48 0, align 16 +; CHECK-NEXT: [[RESERVED:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 2 +; CHECK-NEXT: store ptr [[ZERO]], ptr [[RESERVED]], align 8 +; CHECK-NEXT: br i1 [[COND:%.*]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] +; CHECK: if.then: +; CHECK-NEXT: [[EXCEPTION:%.*]] = tail call ptr @__cxa_allocate_exception(i64 4) +; CHECK-NEXT: store i32 23, ptr [[EXCEPTION]], align 16 +; CHECK-NEXT: [[LIVE:%.*]] = call i64 @llvm.aarch64.sme.get.live.za.slices() +; CHECK-NEXT: [[LIVE_TRUNC:%.*]] = trunc i64 [[LIVE]] to i16 +; CHECK-NEXT: [[TPIDR2_OBJ_LIVE:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 1 +; CHECK-NEXT: store i16 [[LIVE_TRUNC]], ptr [[TPIDR2_OBJ_LIVE]], align 2 +; CHECK-NEXT: [[TPI_INT:%.*]] = ptrtoint ptr [[TPIDR2_INVOKE_OBJ]] to i64 +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 [[TPI_INT]]) +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_save() +; CHECK-NEXT: invoke void @__cxa_throw(ptr nonnull [[EXCEPTION]], ptr nonnull @except, ptr null) +; CHECK-NEXT: to label [[CHECK_ZA:%.*]] unwind label [[LPAD:%.*]] +; CHECK: lpad: +; CHECK-NEXT: [[TMP0:%.*]] = landingpad { ptr, i32 } +; CHECK-NEXT: catch ptr null +; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { ptr, i32 } [[TMP0]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = tail call ptr @__cxa_begin_catch(ptr [[TMP1]]) +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_restore(ptr [[TPIDR2_INVOKE_OBJ]]) +; CHECK-NEXT: [[LIVE1:%.*]] = call i64 @llvm.aarch64.sme.get.live.za.slices() +; CHECK-NEXT: [[LIVE_TRUNC2:%.*]] = trunc i64 [[LIVE1]] to i16 +; CHECK-NEXT: [[TPIDR2_OBJ_LIVE3:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 1 +; CHECK-NEXT: store i16 [[LIVE_TRUNC2]], ptr [[TPIDR2_OBJ_LIVE3]], align 2 +; CHECK-NEXT: [[TPI_INT4:%.*]] = ptrtoint ptr [[TPIDR2_INVOKE_OBJ]] to i64 +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 [[TPI_INT4]]) +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_save() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: invoke void @fizz_shared() +; CHECK-NEXT: to label [[INVOKE_CONT2:%.*]] unwind label [[LPAD1:%.*]] +; CHECK: invoke.cont2: +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: tail call void @__cxa_end_catch() +; CHECK-NEXT: br label [[TRY_CONT:%.*]] +; CHECK: try.cont: +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: ret void +; CHECK: if.end: +; CHECK-NEXT: [[LIVE5:%.*]] = call i64 @llvm.aarch64.sme.get.live.za.slices() +; CHECK-NEXT: [[LIVE_TRUNC6:%.*]] = trunc i64 [[LIVE5]] to i16 +; CHECK-NEXT: 
[[TPIDR2_OBJ_LIVE7:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 1 +; CHECK-NEXT: store i16 [[LIVE_TRUNC6]], ptr [[TPIDR2_OBJ_LIVE7]], align 2 +; CHECK-NEXT: [[TPI_INT8:%.*]] = ptrtoint ptr [[TPIDR2_INVOKE_OBJ]] to i64 +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 [[TPI_INT8]]) +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_save() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: invoke void @bar_shared() +; CHECK-NEXT: to label [[TRY_CONT]] unwind label [[LPAD]] +; CHECK: lpad1: +; CHECK-NEXT: [[TMP3:%.*]] = landingpad { ptr, i32 } +; CHECK-NEXT: cleanup +; CHECK-NEXT: [[LIVE11:%.*]] = call i64 @llvm.aarch64.sme.get.live.za.slices() +; CHECK-NEXT: [[LIVE_TRUNC12:%.*]] = trunc i64 [[LIVE11]] to i16 +; CHECK-NEXT: [[TPIDR2_OBJ_LIVE13:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 1 +; CHECK-NEXT: store i16 [[LIVE_TRUNC12]], ptr [[TPIDR2_OBJ_LIVE13]], align 2 +; CHECK-NEXT: [[TPI_INT14:%.*]] = ptrtoint ptr [[TPIDR2_INVOKE_OBJ]] to i64 +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 [[TPI_INT14]]) +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_save() +; CHECK-NEXT: invoke void @__cxa_end_catch() +; CHECK-NEXT: to label [[CHECK_ZA9:%.*]] unwind label [[TERMINATE_LPAD:%.*]] +; CHECK: check.za9: +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: [[TPIDR215:%.*]] = call i64 @llvm.aarch64.sme.get.tpidr2() +; CHECK-NEXT: [[CMP16:%.*]] = icmp eq i64 [[TPIDR215]], 0 +; CHECK-NEXT: br i1 [[CMP16]], label [[RESTORE_ZA10:%.*]], label [[EH_RESUME:%.*]] +; CHECK: restore.za10: +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_restore(ptr [[TPIDR2_INVOKE_OBJ]]) +; CHECK-NEXT: br label [[EH_RESUME]] +; CHECK: eh.resume: +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: call void @llvm.aarch64.sme.za.disable() +; CHECK-NEXT: resume { ptr, i32 } [[TMP3]] +; CHECK: terminate.lpad: +; CHECK-NEXT: [[TMP4:%.*]] = landingpad { ptr, i32 } +; CHECK-NEXT: catch ptr null +; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { ptr, i32 } [[TMP4]], 0 +; CHECK-NEXT: tail call void @__clang_call_terminate(ptr [[TMP5]]) +; CHECK-NEXT: unreachable +; CHECK: check.za: +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: [[TPIDR2:%.*]] = call i64 @llvm.aarch64.sme.get.tpidr2() +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[TPIDR2]], 0 +; CHECK-NEXT: br i1 [[CMP]], label [[RESTORE_ZA:%.*]], label [[UNREACHABLE:%.*]] +; CHECK: restore.za: +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_restore(ptr [[TPIDR2_INVOKE_OBJ]]) +; CHECK-NEXT: br label [[UNREACHABLE]] +; CHECK: unreachable: +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: unreachable +; +entry: + br i1 %cond, label %if.then, label %if.end + +if.then: + %exception = tail call ptr @__cxa_allocate_exception(i64 4) + store i32 23, ptr %exception, align 16 + invoke void @__cxa_throw(ptr nonnull %exception, ptr nonnull @except, ptr null) + to label %unreachable unwind label %lpad + +lpad: + %0 = landingpad { ptr, i32 } + catch ptr null + %1 = extractvalue { ptr, i32 } %0, 0 + %2 = tail call ptr @__cxa_begin_catch(ptr %1) + invoke void @fizz_shared() + to label %invoke.cont2 unwind label %lpad1 + +invoke.cont2: + tail call void @__cxa_end_catch() + br label %try.cont + +try.cont: + 
ret void + +if.end: + invoke void @bar_shared() + to label %try.cont unwind label %lpad + +lpad1: + %3 = landingpad { ptr, i32 } + cleanup + invoke void @__cxa_end_catch() + to label %eh.resume unwind label %terminate.lpad + +eh.resume: + resume { ptr, i32 } %3 + +terminate.lpad: + %4 = landingpad { ptr, i32 } + catch ptr null + %5 = extractvalue { ptr, i32 } %4, 0 + tail call void @__clang_call_terminate(ptr %5) + unreachable + +unreachable: + unreachable +} + +; +; int foo_throw(bool a) throw(const char *) { +; if (a) { +; bar(); +; return 42; +; } +; fizz(); +; return 23; +; } +; + +define i32 @throw_in_signature(i1 %a) "aarch64_pstate_za_shared" personality i32 1 { +; CHECK-LABEL: @throw_in_signature( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TPIDR2_INVOKE_OBJ:%.*]] = alloca { ptr, i16, [6 x i8] }, align 8 +; CHECK-NEXT: [[N:%.*]] = call i64 @llvm.aarch64.sme.cntsb() +; CHECK-NEXT: [[NN:%.*]] = mul i64 [[N]], [[N]] +; CHECK-NEXT: [[BUFFER:%.*]] = alloca i8, i64 [[NN]], align 16 +; CHECK-NEXT: [[TPIDR2_INVOKE_OBJ_BUFFER:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 0 +; CHECK-NEXT: store ptr [[BUFFER]], ptr [[TPIDR2_INVOKE_OBJ_BUFFER]], align 8 +; CHECK-NEXT: [[ZERO:%.*]] = alloca i48, i48 0, align 16 +; CHECK-NEXT: [[RESERVED:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 2 +; CHECK-NEXT: store ptr [[ZERO]], ptr [[RESERVED]], align 8 +; CHECK-NEXT: br i1 [[A:%.*]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] +; CHECK: if.then: +; CHECK-NEXT: [[LIVE:%.*]] = call i64 @llvm.aarch64.sme.get.live.za.slices() +; CHECK-NEXT: [[LIVE_TRUNC:%.*]] = trunc i64 [[LIVE]] to i16 +; CHECK-NEXT: [[TPIDR2_OBJ_LIVE:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 1 +; CHECK-NEXT: store i16 [[LIVE_TRUNC]], ptr [[TPIDR2_OBJ_LIVE]], align 2 +; CHECK-NEXT: [[TPI_INT:%.*]] = ptrtoint ptr [[TPIDR2_INVOKE_OBJ]] to i64 +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 [[TPI_INT]]) +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_save() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: invoke void @bar_shared() +; CHECK-NEXT: to label [[RETURN:%.*]] unwind label [[LPAD:%.*]] +; CHECK: lpad: +; CHECK-NEXT: [[TMP0:%.*]] = landingpad { ptr, i32 } +; CHECK-NEXT: filter [1 x ptr] [ptr @except] +; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { ptr, i32 } [[TMP0]], 1 +; CHECK-NEXT: [[EHSPEC_FAILS:%.*]] = icmp slt i32 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[EHSPEC_FAILS]], label [[EHSPEC_UNEXPECTED:%.*]], label [[EH_RESUME:%.*]] +; CHECK: ehspec.unexpected: +; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { ptr, i32 } [[TMP0]], 0 +; CHECK-NEXT: tail call void @__cxa_call_unexpected(ptr [[TMP2]]) +; CHECK-NEXT: unreachable +; CHECK: if.end: +; CHECK-NEXT: [[LIVE1:%.*]] = call i64 @llvm.aarch64.sme.get.live.za.slices() +; CHECK-NEXT: [[LIVE_TRUNC2:%.*]] = trunc i64 [[LIVE1]] to i16 +; CHECK-NEXT: [[TPIDR2_OBJ_LIVE3:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 1 +; CHECK-NEXT: store i16 [[LIVE_TRUNC2]], ptr [[TPIDR2_OBJ_LIVE3]], align 2 +; CHECK-NEXT: [[TPI_INT4:%.*]] = ptrtoint ptr [[TPIDR2_INVOKE_OBJ]] to i64 +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 [[TPI_INT4]]) +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_save() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; 
CHECK-NEXT: invoke void @fizz_shared() +; CHECK-NEXT: to label [[RETURN]] unwind label [[LPAD]] +; CHECK: return: +; CHECK-NEXT: [[RETVAL:%.*]] = phi i32 [ 42, [[IF_THEN]] ], [ 23, [[IF_END]] ] +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: ret i32 [[RETVAL]] +; CHECK: eh.resume: +; CHECK-NEXT: call void @llvm.aarch64.sme.za.disable() +; CHECK-NEXT: resume { ptr, i32 } [[TMP0]] +; +entry: + br i1 %a, label %if.then, label %if.end + +if.then: + invoke void @bar_shared() + to label %return unwind label %lpad + +lpad: + %0 = landingpad { ptr, i32 } + filter [1 x ptr] [ptr @except] + %1 = extractvalue { ptr, i32 } %0, 1 + %ehspec.fails = icmp slt i32 %1, 0 + br i1 %ehspec.fails, label %ehspec.unexpected, label %eh.resume + +ehspec.unexpected: + %2 = extractvalue { ptr, i32 } %0, 0 + tail call void @__cxa_call_unexpected(ptr %2) #2 + unreachable + +if.end: + invoke void @fizz_shared() + to label %return unwind label %lpad + +return: + %retval = phi i32 [ 42, %if.then ], [ 23, %if.end ] + ret i32 %retval + +eh.resume: + resume { ptr, i32 } %0 +} + +; +; void try_func() try { +; fizz(); +; } +; catch (const std::runtime_error& e) { +; buzz(); +; } +; + +define void @try_func() "aarch64_pstate_za_shared" personality i32 1 { +; CHECK-LABEL: @try_func( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TPIDR2_INVOKE_OBJ:%.*]] = alloca { ptr, i16, [6 x i8] }, align 8 +; CHECK-NEXT: [[N:%.*]] = call i64 @llvm.aarch64.sme.cntsb() +; CHECK-NEXT: [[NN:%.*]] = mul i64 [[N]], [[N]] +; CHECK-NEXT: [[BUFFER:%.*]] = alloca i8, i64 [[NN]], align 16 +; CHECK-NEXT: [[TPIDR2_INVOKE_OBJ_BUFFER:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 0 +; CHECK-NEXT: store ptr [[BUFFER]], ptr [[TPIDR2_INVOKE_OBJ_BUFFER]], align 8 +; CHECK-NEXT: [[ZERO:%.*]] = alloca i48, i48 0, align 16 +; CHECK-NEXT: [[RESERVED:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 2 +; CHECK-NEXT: store ptr [[ZERO]], ptr [[RESERVED]], align 8 +; CHECK-NEXT: [[LIVE:%.*]] = call i64 @llvm.aarch64.sme.get.live.za.slices() +; CHECK-NEXT: [[LIVE_TRUNC:%.*]] = trunc i64 [[LIVE]] to i16 +; CHECK-NEXT: [[TPIDR2_OBJ_LIVE:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 1 +; CHECK-NEXT: store i16 [[LIVE_TRUNC]], ptr [[TPIDR2_OBJ_LIVE]], align 2 +; CHECK-NEXT: [[TPI_INT:%.*]] = ptrtoint ptr [[TPIDR2_INVOKE_OBJ]] to i64 +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 [[TPI_INT]]) +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_save() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: invoke void @fizz_shared() +; CHECK-NEXT: to label [[TRY_CONT:%.*]] unwind label [[LPAD:%.*]] +; CHECK: lpad: +; CHECK-NEXT: [[TMP0:%.*]] = landingpad { ptr, i32 } +; CHECK-NEXT: catch ptr @runtime_error +; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { ptr, i32 } [[TMP0]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = tail call i32 @llvm.eh.typeid.for(ptr nonnull @runtime_error) +; CHECK-NEXT: [[MATCHES:%.*]] = icmp eq i32 [[TMP1]], [[TMP2]] +; CHECK-NEXT: br i1 [[MATCHES]], label [[CATCH:%.*]], label [[EH_RESUME:%.*]] +; CHECK: catch: +; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { ptr, i32 } [[TMP0]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = tail call ptr @__cxa_begin_catch(ptr [[TMP3]]) +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_restore(ptr [[TPIDR2_INVOKE_OBJ]]) +; CHECK-NEXT: 
[[LIVE1:%.*]] = call i64 @llvm.aarch64.sme.get.live.za.slices() +; CHECK-NEXT: [[LIVE_TRUNC2:%.*]] = trunc i64 [[LIVE1]] to i16 +; CHECK-NEXT: [[TPIDR2_OBJ_LIVE3:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 1 +; CHECK-NEXT: store i16 [[LIVE_TRUNC2]], ptr [[TPIDR2_OBJ_LIVE3]], align 2 +; CHECK-NEXT: [[TPI_INT4:%.*]] = ptrtoint ptr [[TPIDR2_INVOKE_OBJ]] to i64 +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 [[TPI_INT4]]) +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_save() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: invoke void @buzz_shared() +; CHECK-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[LPAD1:%.*]] +; CHECK: invoke.cont: +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: tail call void @__cxa_end_catch() +; CHECK-NEXT: br label [[TRY_CONT]] +; CHECK: try.cont: +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: ret void +; CHECK: lpad1: +; CHECK-NEXT: [[TMP5:%.*]] = landingpad { ptr, i32 } +; CHECK-NEXT: cleanup +; CHECK-NEXT: [[LIVE5:%.*]] = call i64 @llvm.aarch64.sme.get.live.za.slices() +; CHECK-NEXT: [[LIVE_TRUNC6:%.*]] = trunc i64 [[LIVE5]] to i16 +; CHECK-NEXT: [[TPIDR2_OBJ_LIVE7:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 1 +; CHECK-NEXT: store i16 [[LIVE_TRUNC6]], ptr [[TPIDR2_OBJ_LIVE7]], align 2 +; CHECK-NEXT: [[TPI_INT8:%.*]] = ptrtoint ptr [[TPIDR2_INVOKE_OBJ]] to i64 +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 [[TPI_INT8]]) +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_save() +; CHECK-NEXT: invoke void @__cxa_end_catch() +; CHECK-NEXT: to label [[CHECK_ZA:%.*]] unwind label [[TERMINATE_LPAD:%.*]] +; CHECK: check.za: +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: [[TPIDR2:%.*]] = call i64 @llvm.aarch64.sme.get.tpidr2() +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[TPIDR2]], 0 +; CHECK-NEXT: br i1 [[CMP]], label [[RESTORE_ZA:%.*]], label [[EH_RESUME]] +; CHECK: restore.za: +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_restore(ptr [[TPIDR2_INVOKE_OBJ]]) +; CHECK-NEXT: br label [[EH_RESUME]] +; CHECK: eh.resume: +; CHECK-NEXT: [[LPAD_VAL6_MERGED:%.*]] = phi { ptr, i32 } [ [[TMP0]], [[LPAD]] ], [ [[TMP5]], [[CHECK_ZA]] ], [ [[TMP5]], [[RESTORE_ZA]] ] +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: call void @llvm.aarch64.sme.za.disable() +; CHECK-NEXT: resume { ptr, i32 } [[LPAD_VAL6_MERGED]] +; CHECK: terminate.lpad: +; CHECK-NEXT: [[TMP6:%.*]] = landingpad { ptr, i32 } +; CHECK-NEXT: catch ptr null +; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { ptr, i32 } [[TMP6]], 0 +; CHECK-NEXT: tail call void @__clang_call_terminate(ptr [[TMP7]]) +; CHECK-NEXT: unreachable +; +entry: + invoke void @fizz_shared() + to label %try.cont unwind label %lpad + +lpad: + %0 = landingpad { ptr, i32 } + catch ptr @runtime_error + %1 = extractvalue { ptr, i32 } %0, 1 + %2 = tail call i32 @llvm.eh.typeid.for(ptr nonnull @runtime_error) + %matches = icmp eq i32 %1, %2 + br i1 %matches, label %catch, label %eh.resume + +catch: + %3 = extractvalue { ptr, i32 } %0, 0 + %4 = tail call ptr @__cxa_begin_catch(ptr %3) #4 + invoke void @buzz_shared() + to label %invoke.cont unwind label %lpad1 + +invoke.cont: + tail call void @__cxa_end_catch() + br label %try.cont + +try.cont: + ret void + 
+lpad1: + %5 = landingpad { ptr, i32 } + cleanup + invoke void @__cxa_end_catch() + to label %eh.resume unwind label %terminate.lpad + +eh.resume: + %lpad.val6.merged = phi { ptr, i32 } [ %0, %lpad ], [ %5, %lpad1 ] + resume { ptr, i32 } %lpad.val6.merged + +terminate.lpad: + %6 = landingpad { ptr, i32 } + catch ptr null + %7 = extractvalue { ptr, i32 } %6, 0 + tail call void @__clang_call_terminate(ptr %7) + unreachable +} + +declare i32 @llvm.eh.typeid.for(ptr) +declare ptr @__cxa_begin_catch(ptr) +declare void @__cxa_end_catch() +declare ptr @__cxa_allocate_exception(i64) +declare void @__cxa_throw(ptr, ptr, ptr) +declare void @__cxa_call_unexpected(ptr) noreturn +declare void @__clang_call_terminate(ptr) noreturn + +; CHECK: attributes #0 = { "aarch64_pstate_za_shared" } +; CHECK: attributes #1 = { "aarch64_pstate_za_new" } +; CHECK: attributes #2 = { "aarch64_expanded_pstate_za" "aarch64_pstate_sm_enabled" "aarch64_pstate_za_shared" } +; CHECK: attributes #3 = { "aarch64_expanded_pstate_za" "aarch64_pstate_sm_compatible" "aarch64_pstate_za_shared" } +; CHECK: attributes #4 = { "aarch64_expanded_pstate_za" "aarch64_pstate_za_shared" } +; CHECK: attributes #5 = { "aarch64_expanded_pstate_za" "aarch64_pstate_za_new" } +; CHECK: attributes #6 = { nounwind memory(none) } +; CHECK: attributes #7 = { noreturn } +; CHECK: attributes #8 = { nocallback nofree nosync nounwind willreturn memory(none) } +; CHECK: attributes #9 = { nocallback nofree nosync nounwind willreturn } +; CHECK: attributes #10 = { "aarch64_pstate_sm_compatible" "aarch64_pstate_za_shared" } +; CHECK: attributes #11 = { "aarch64_pstate_sm_compatible" "aarch64_pstate_za_preserved" } diff --git a/llvm/test/CodeGen/AArch64/sme-intrinsics-get-live-za-slices.ll b/llvm/test/CodeGen/AArch64/sme-intrinsics-get-live-za-slices.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme-intrinsics-get-live-za-slices.ll @@ -0,0 +1,13 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -verify-machineinstrs < %s | FileCheck %s + +define i64 @sme_get_live_slices() { +; CHECK-LABEL: sme_get_live_slices: +; CHECK: // %bb.0: +; CHECK-NEXT: rdsvl x0, #1 +; CHECK-NEXT: ret + %slices = call i64 @llvm.aarch64.sme.get.live.za.slices() + ret i64 %slices +} + +declare i64 @llvm.aarch64.sme.get.live.za.slices() diff --git a/llvm/test/CodeGen/AArch64/sme-intrinsics-loads.ll b/llvm/test/CodeGen/AArch64/sme-intrinsics-loads.ll --- a/llvm/test/CodeGen/AArch64/sme-intrinsics-loads.ll +++ b/llvm/test/CodeGen/AArch64/sme-intrinsics-loads.ll @@ -329,6 +329,33 @@ ret void } + +; Test that we can load from the stack as well +; (this requires adding these opcodes to AArch64InstrInfo::getMemOpInfo) +define void @test_memopinfo_sme_ldr() nounwind { +; CHECK-LABEL: test_memopinfo_sme_ldr: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-16 +; CHECK-NEXT: mov w12, wzr +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: addvl x9, x8, #15 +; CHECK-NEXT: ldr za[w12, 0], [x8] +; CHECK-NEXT: addvl x8, x8, #16 +; CHECK-NEXT: ldr za[w12, 0], [x9] +; CHECK-NEXT: ldr za[w12, 0], [x8] +; CHECK-NEXT: addvl sp, sp, #16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %ptr = alloca <vscale x 16 x i8>, i64 16, align 16 + %ptr15 = getelementptr <vscale x 16 x i8>, ptr %ptr, i64 15 + %ptr16 = getelementptr <vscale x 16 x i8>, ptr %ptr, i64 16 + call void @llvm.aarch64.sme.ldr(i32 0, ptr %ptr) + call void @llvm.aarch64.sme.ldr(i32 0, ptr %ptr15) + call void @llvm.aarch64.sme.ldr(i32 0, ptr %ptr16) + ret void +} + declare void @llvm.aarch64.sme.ld1b.horiz(<vscale x 16 x i1>, ptr, i32, i32) declare void @llvm.aarch64.sme.ld1h.horiz(<vscale x 8 x i1>, ptr, i32, i32) diff --git a/llvm/test/CodeGen/AArch64/sme-intrinsics-stores.ll b/llvm/test/CodeGen/AArch64/sme-intrinsics-stores.ll --- a/llvm/test/CodeGen/AArch64/sme-intrinsics-stores.ll +++ b/llvm/test/CodeGen/AArch64/sme-intrinsics-stores.ll @@ -329,6 +329,33 @@ ret void } + +; Test that we can store to the stack as well +; (this requires adding these opcodes to AArch64InstrInfo::getMemOpInfo) +define void @test_memopinfo_sme_str() nounwind { +; CHECK-LABEL: test_memopinfo_sme_str: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-16 +; CHECK-NEXT: mov w12, wzr +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: addvl x9, x8, #15 +; CHECK-NEXT: str za[w12, 0], [x8] +; CHECK-NEXT: addvl x8, x8, #16 +; CHECK-NEXT: str za[w12, 0], [x9] +; CHECK-NEXT: str za[w12, 0], [x8] +; CHECK-NEXT: addvl sp, sp, #16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %ptr = alloca <vscale x 16 x i8>, i64 16, align 16 + %ptr15 = getelementptr <vscale x 16 x i8>, ptr %ptr, i64 15 + %ptr16 = getelementptr <vscale x 16 x i8>, ptr %ptr, i64 16 + call void @llvm.aarch64.sme.str(i32 0, ptr %ptr) + call void @llvm.aarch64.sme.str(i32 0, ptr %ptr15) + call void @llvm.aarch64.sme.str(i32 0, ptr %ptr16) + ret void +} + declare void @llvm.aarch64.sme.st1b.horiz(<vscale x 16 x i1>, ptr, i32, i32) declare void @llvm.aarch64.sme.st1h.horiz(<vscale x 8 x i1>, ptr, i32, i32) declare void @llvm.aarch64.sme.st1w.horiz(<vscale x 4 x i1>, ptr, i32, i32) diff --git a/llvm/test/CodeGen/AArch64/sme-invoke-resume-pstatesm.ll b/llvm/test/CodeGen/AArch64/sme-invoke-resume-pstatesm.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme-invoke-resume-pstatesm.ll @@ -0,0 +1,63 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -verify-machineinstrs < %s | FileCheck %s + +define void @unconditional_smstart() nounwind { +; CHECK-LABEL: unconditional_smstart: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-64]! // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #64 // 16-byte Folded Reload +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.invoke.resume.pstatesm(i64 1) + ret void +} + +; FIXME: This case can be optimised away, since we know the condition +; will evaluate to false. 
+define void @conditional_smstart() nounwind { +; CHECK-LABEL: conditional_smstart: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-64]! // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: tbz xzr, #0, .LBB1_2 +; CHECK-NEXT: // %bb.1: +; CHECK-NEXT: smstart sm +; CHECK-NEXT: .LBB1_2: +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #64 // 16-byte Folded Reload +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.invoke.resume.pstatesm(i64 0) + ret void +} + +define void @conditional_smstart_with_entry_val(i64 %pstate) nounwind { +; CHECK-LABEL: conditional_smstart_with_entry_val: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-64]! // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: tbz x0, #0, .LBB2_2 +; CHECK-NEXT: // %bb.1: +; CHECK-NEXT: smstart sm +; CHECK-NEXT: .LBB2_2: +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #64 // 16-byte Folded Reload +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.invoke.resume.pstatesm(i64 %pstate) + ret void +} + +declare void @llvm.aarch64.sme.invoke.resume.pstatesm(i64) diff --git a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll --- a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll +++ b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll @@ -11,14 +11,15 @@ ; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill ; CHECK-NEXT: mov x29, sp ; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: mul x8, x8, x8 -; CHECK-NEXT: sub x9, x9, x8 -; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: rdsvl x9, #1 +; CHECK-NEXT: msub x8, x9, x9, x8 +; CHECK-NEXT: mov sp, x8 ; CHECK-NEXT: sub x10, x29, #16 -; CHECK-NEXT: stur x9, [x29, #-16] -; CHECK-NEXT: sturh w8, [x29, #-8] +; CHECK-NEXT: stur wzr, [x29, #-4] +; CHECK-NEXT: sturh wzr, [x29, #-6] +; CHECK-NEXT: stur x8, [x29, #-16] +; CHECK-NEXT: sturh w9, [x29, #-8] ; CHECK-NEXT: msr TPIDR2_EL0, x10 ; CHECK-NEXT: bl private_za_callee ; CHECK-NEXT: smstart za @@ -44,12 +45,13 @@ ; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: mov x29, sp ; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: mul x19, x8, x8 ; CHECK-NEXT: mov x8, sp -; CHECK-NEXT: sub x8, x8, x19 +; CHECK-NEXT: rdsvl x19, #1 +; CHECK-NEXT: msub x8, x19, x19, x8 ; CHECK-NEXT: mov sp, x8 ; CHECK-NEXT: sub x20, x29, #16 +; CHECK-NEXT: stur wzr, [x29, #-4] +; CHECK-NEXT: sturh wzr, [x29, #-6] ; CHECK-NEXT: stur x8, [x29, #-16] ; CHECK-NEXT: sturh w19, [x29, #-8] ; CHECK-NEXT: msr TPIDR2_EL0, x20 @@ -89,14 +91,15 @@ ; CHECK-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill ; CHECK-NEXT: mov x29, sp ; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: mul x8, x8, x8 -; CHECK-NEXT: sub x9, x9, x8 -; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: rdsvl x9, #1 +; CHECK-NEXT: msub x8, x9, x9, x8 +; CHECK-NEXT: mov sp, x8 ; CHECK-NEXT: sub x10, x29, #16 -; CHECK-NEXT: stur x9, [x29, #-16] -; CHECK-NEXT: sturh w8, [x29, #-8] +; CHECK-NEXT: stur wzr, [x29, #-4] +; CHECK-NEXT: sturh wzr, [x29, #-6] +; CHECK-NEXT: stur x8, [x29, #-16] +; CHECK-NEXT: sturh w9, [x29, #-8] ; CHECK-NEXT: msr TPIDR2_EL0, x10 ; CHECK-NEXT: bl cosf ; CHECK-NEXT: smstart za @@ -126,14 +129,15 @@ ; CHECK-NEXT: add x29, sp, #64 ; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: mul x8, x8, x8 -; CHECK-NEXT: sub x9, x9, x8 -; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: rdsvl x9, #1 +; CHECK-NEXT: msub x8, x9, x9, x8 +; CHECK-NEXT: mov sp, x8 ; CHECK-NEXT: sub x10, x29, #80 -; CHECK-NEXT: stur x9, [x29, #-80] -; CHECK-NEXT: sturh w8, [x29, #-72] +; CHECK-NEXT: stur wzr, [x29, #-68] +; CHECK-NEXT: sturh wzr, [x29, #-70] +; CHECK-NEXT: stur x8, [x29, #-80] +; CHECK-NEXT: sturh w9, [x29, #-72] ; CHECK-NEXT: msr TPIDR2_EL0, x10 ; CHECK-NEXT: bl __arm_sme_state ; CHECK-NEXT: and x19, x0, #0x1 diff --git a/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll b/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll @@ -0,0 +1,159 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -stop-after=aarch64-sme-peephole-opt < %s | FileCheck %s + +declare void @normal_callee(); +declare void @streaming_callee() "aarch64_pstate_sm_enabled"; +declare void @streaming_compatible_callee() "aarch64_pstate_sm_compatible"; + +define void @streaming_caller_normal_callee() nounwind "aarch64_pstate_sm_enabled" { + ; CHECK-LABEL: name: streaming_caller_normal_callee + ; CHECK: bb.0 (%ir-block.0): + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit-def $sp + ; CHECK-NEXT: BL @normal_callee, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: BL @normal_callee, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: BL @normal_callee, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: MSRpstatesvcrImm1 1, 1, csr_aarch64_smstartstop, implicit-def dead $nzcv + ; CHECK-NEXT: RET_ReallyLR + call void @normal_callee(); + call void @normal_callee(); + call void @normal_callee(); + ret void; +} + +define void @locally_streaming_caller_normal_callee() nounwind "aarch64_pstate_sm_body" { + ; CHECK-LABEL: name: locally_streaming_caller_normal_callee + ; CHECK: bb.0 (%ir-block.0): + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: BL @normal_callee, 
csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: BL @normal_callee, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: BL @normal_callee, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: RET_ReallyLR + call void @normal_callee(); + call void @normal_callee(); + call void @normal_callee(); + ret void; +} + +; FIXME: This test no longer results in any conditional smstart/stop pairs being optimised after adding a check that the pstate.sm register must match. +define void @streaming_compatible_caller_normal_callee() nounwind "aarch64_pstate_sm_compatible" { + ; CHECK-LABEL: name: streaming_compatible_caller_normal_callee + ; CHECK: bb.0 (%ir-block.0): + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: BL &__arm_sme_state, csr_aarch64_sme_abi_support_routines_preservemost_from_x2, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $x0 + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0 + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: [[ANDXri:%[0-9]+]]:gpr64common = ANDXri [[COPY]], 4096 + ; CHECK-NEXT: MSRpstatePseudo 1, 0, [[ANDXri]], 0, csr_aarch64_smstartstop, implicit-def $sp + ; CHECK-NEXT: BL @normal_callee, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: MSRpstatePseudo 1, 1, [[ANDXri]], 0, csr_aarch64_smstartstop + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: BL &__arm_sme_state, csr_aarch64_sme_abi_support_routines_preservemost_from_x2, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $x0 + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64 = COPY $x0 + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: [[ANDXri1:%[0-9]+]]:gpr64common = ANDXri [[COPY1]], 4096 + ; CHECK-NEXT: MSRpstatePseudo 1, 0, [[ANDXri1]], 0, csr_aarch64_smstartstop, implicit-def $sp + ; CHECK-NEXT: BL @normal_callee, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: MSRpstatePseudo 1, 1, [[ANDXri1]], 0, csr_aarch64_smstartstop + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: BL &__arm_sme_state, csr_aarch64_sme_abi_support_routines_preservemost_from_x2, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $x0 + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64 = COPY $x0 + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: [[ANDXri2:%[0-9]+]]:gpr64common = ANDXri [[COPY2]], 4096 + ; CHECK-NEXT: MSRpstatePseudo 1, 0, [[ANDXri2]], 0, csr_aarch64_smstartstop, implicit-def $sp + ; CHECK-NEXT: BL @normal_callee, csr_aarch64_aapcs, 
implicit-def dead $lr, implicit $sp, implicit-def $sp + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: MSRpstatePseudo 1, 1, [[ANDXri2]], 0, csr_aarch64_smstartstop + ; CHECK-NEXT: RET_ReallyLR + call void @normal_callee(); + call void @normal_callee(); + call void @normal_callee(); + ret void; +} + +define void @normal_caller_streaming_callee() nounwind { + ; CHECK-LABEL: name: normal_caller_streaming_callee + ; CHECK: bb.0 (%ir-block.0): + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: MSRpstatesvcrImm1 1, 1, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit-def $sp + ; CHECK-NEXT: BL @streaming_callee, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: BL @streaming_callee, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: BL @streaming_callee, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $nzcv + ; CHECK-NEXT: RET_ReallyLR + call void @streaming_callee(); + call void @streaming_callee(); + call void @streaming_callee(); + ret void; +} + +define void @streaming_compatible_caller_mixed_callees() nounwind "aarch64_pstate_sm_compatible" { + ; CHECK-LABEL: name: streaming_compatible_caller_mixed_callees + ; CHECK: bb.0 (%ir-block.0): + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: BL &__arm_sme_state, csr_aarch64_sme_abi_support_routines_preservemost_from_x2, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $x0 + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0 + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: [[ANDXri:%[0-9]+]]:gpr64common = ANDXri [[COPY]], 4096 + ; CHECK-NEXT: MSRpstatePseudo 1, 1, [[ANDXri]], 1, csr_aarch64_smstartstop, implicit-def $sp + ; CHECK-NEXT: BL @streaming_callee, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: MSRpstatePseudo 1, 0, [[ANDXri]], 1, csr_aarch64_smstartstop + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: BL &__arm_sme_state, csr_aarch64_sme_abi_support_routines_preservemost_from_x2, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $x0 + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64 = COPY $x0 + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: [[ANDXri1:%[0-9]+]]:gpr64common = ANDXri [[COPY1]], 4096 + ; CHECK-NEXT: MSRpstatePseudo 1, 0, [[ANDXri1]], 0, csr_aarch64_smstartstop, implicit-def $sp + ; CHECK-NEXT: BL @normal_callee, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: MSRpstatePseudo 1, 1, [[ANDXri1]], 0, csr_aarch64_smstartstop + ; CHECK-NEXT: 
ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: BL &__arm_sme_state, csr_aarch64_sme_abi_support_routines_preservemost_from_x2, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $x0 + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64 = COPY $x0 + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: [[ANDXri2:%[0-9]+]]:gpr64common = ANDXri [[COPY2]], 4096 + ; CHECK-NEXT: MSRpstatePseudo 1, 1, [[ANDXri2]], 1, csr_aarch64_smstartstop, implicit-def $sp + ; CHECK-NEXT: BL @streaming_callee, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: MSRpstatePseudo 1, 0, [[ANDXri2]], 1, csr_aarch64_smstartstop + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: BL &__arm_sme_state, csr_aarch64_sme_abi_support_routines_preservemost_from_x2, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $x0 + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr64 = COPY $x0 + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: [[ANDXri3:%[0-9]+]]:gpr64common = ANDXri [[COPY3]], 4096 + ; CHECK-NEXT: MSRpstatePseudo 1, 0, [[ANDXri3]], 0, csr_aarch64_smstartstop, implicit-def $sp + ; CHECK-NEXT: BL @normal_callee, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: MSRpstatePseudo 1, 1, [[ANDXri3]], 0, csr_aarch64_smstartstop + ; CHECK-NEXT: RET_ReallyLR + call void @streaming_callee(); + call void @normal_callee(); + call void @streaming_callee(); + call void @normal_callee(); + ret void; +} diff --git a/llvm/test/CodeGen/AArch64/sme-peephole-opts.mir b/llvm/test/CodeGen/AArch64/sme-peephole-opts.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme-peephole-opts.mir @@ -0,0 +1,172 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2 +# RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -run-pass=aarch64-sme-peephole-opt -verify-machineinstrs %s -o - | FileCheck %s +--- | + ; ModuleID = '' + source_filename = "" + target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" + target triple = "aarch64-unknown-linux-gnu" + + declare void @normal_callee() #0 + + ; Function Attrs: nounwind + define void @streaming_compatible_smstartstop_match() #1 { + call void @normal_callee() + call void @normal_callee() + ret void + } + + ; Function Attrs: nounwind + define void @streaming_compatible_smstartstop_regmask() #1 { + call void @normal_callee() + call void @normal_callee() + ret void + } + + ; Function Attrs: nounwind + define void @streaming_compatible_smstartstop_physical_regs() #1 { + call void @normal_callee() + call void @normal_callee() + ret void + } + + attributes #0 = { "target-features"="+sme" } + attributes #1 = { nounwind "aarch64_expanded_pstate_za" "aarch64_pstate_sm_compatible" "target-features"="+sme" } + +... 
+--- +# smstart/smstop pair can be removed by aarch64-sme-peephole-opt as the virtual registers for pstate.sm match + +name: streaming_compatible_smstartstop_match +alignment: 4 +tracksRegLiveness: true +liveins: [] +body: | + bb.0 (%ir-block.0): + ; CHECK-LABEL: name: streaming_compatible_smstartstop_match + ; CHECK: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: BL &__arm_sme_state, csr_aarch64_sme_abi_support_routines_preservemost_from_x2, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $x0 + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0 + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: MSRpstatePseudo 1, 0, [[COPY]], 0, csr_aarch64_smstartstop, implicit-def $sp + ; CHECK-NEXT: BL @normal_callee, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: BL &__arm_sme_state, csr_aarch64_sme_abi_support_routines_preservemost_from_x2, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $x0 + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: BL @normal_callee, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: MSRpstatePseudo 1, 1, [[COPY]], 0, csr_aarch64_smstartstop + ; CHECK-NEXT: RET_ReallyLR + ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + BL &__arm_sme_state, csr_aarch64_sme_abi_support_routines_preservemost_from_x2, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $x0 + ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + %0:gpr64 = COPY $x0 + ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + MSRpstatePseudo 1, 0, %0, 0, csr_aarch64_smstartstop, implicit-def $sp + BL @normal_callee, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp + ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + MSRpstatePseudo 1, 1, %0, 0, csr_aarch64_smstartstop + ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + BL &__arm_sme_state, csr_aarch64_sme_abi_support_routines_preservemost_from_x2, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $x0 + ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + MSRpstatePseudo 1, 0, %0, 0, csr_aarch64_smstartstop, implicit-def $sp + BL @normal_callee, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp + ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + MSRpstatePseudo 1, 1, %0, 0, csr_aarch64_smstartstop + RET_ReallyLR +... 
+--- +# No smstart/smstop pairs should be removed by aarch64-sme-peephole-opt as the register masks do not match + +name: streaming_compatible_smstartstop_regmask +alignment: 4 +tracksRegLiveness: true +liveins: [] +body: | + bb.0 (%ir-block.0): + ; CHECK-LABEL: name: streaming_compatible_smstartstop_regmask + ; CHECK: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: BL &__arm_sme_state, csr_aarch64_sme_abi_support_routines_preservemost_from_x2, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $x0 + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0 + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: MSRpstatePseudo 1, 0, [[COPY]], 0, csr_aarch64_smstartstop, implicit-def $sp + ; CHECK-NEXT: BL @normal_callee, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: MSRpstatePseudo 1, 1, [[COPY]], 0, csr_aarch64_smstartstop + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: BL &__arm_sme_state, csr_aarch64_sme_abi_support_routines_preservemost_from_x2, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $x0 + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: MSRpstatePseudo 1, 0, [[COPY]], 0, csr_aarch64_aapcs, implicit-def $sp + ; CHECK-NEXT: BL @normal_callee, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: MSRpstatePseudo 1, 1, [[COPY]], 0, csr_aarch64_smstartstop + ; CHECK-NEXT: RET_ReallyLR + ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + BL &__arm_sme_state, csr_aarch64_sme_abi_support_routines_preservemost_from_x2, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $x0 + ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + %0:gpr64 = COPY $x0 + ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + MSRpstatePseudo 1, 0, %0, 0, csr_aarch64_smstartstop, implicit-def $sp + BL @normal_callee, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp + ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + MSRpstatePseudo 1, 1, %0, 0, csr_aarch64_smstartstop + ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + BL &__arm_sme_state, csr_aarch64_sme_abi_support_routines_preservemost_from_x2, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $x0 + ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + MSRpstatePseudo 1, 0, %0, 0, csr_aarch64_aapcs, implicit-def $sp + BL @normal_callee, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp + ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + MSRpstatePseudo 1, 1, %0, 0, csr_aarch64_smstartstop + RET_ReallyLR +... 
+--- +# No smstart/smstop pairs should be removed by aarch64-sme-peephole-opt as the value of pstate.sm is held in a physical register + +name: streaming_compatible_smstartstop_physical_regs +alignment: 4 +tracksRegLiveness: true +liveins: [] +body: | + bb.0 (%ir-block.0): + ; CHECK-LABEL: name: streaming_compatible_smstartstop_physical_regs + ; CHECK: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: BL &__arm_sme_state, csr_aarch64_sme_abi_support_routines_preservemost_from_x2, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $x0 + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: MSRpstatePseudo 1, 0, undef $x7, 0, csr_aarch64_smstartstop, implicit-def $sp + ; CHECK-NEXT: BL @normal_callee, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: MSRpstatePseudo 1, 1, undef $x7, 0, csr_aarch64_smstartstop + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: BL &__arm_sme_state, csr_aarch64_sme_abi_support_routines_preservemost_from_x2, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $x0 + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: MSRpstatePseudo 1, 0, undef $x7, 0, csr_aarch64_smstartstop, implicit-def $sp + ; CHECK-NEXT: BL @normal_callee, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: MSRpstatePseudo 1, 1, undef $x7, 0, csr_aarch64_smstartstop + ; CHECK-NEXT: RET_ReallyLR + ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + BL &__arm_sme_state, csr_aarch64_sme_abi_support_routines_preservemost_from_x2, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $x0 + ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + MSRpstatePseudo 1, 0, undef $x7, 0, csr_aarch64_smstartstop, implicit-def $sp + BL @normal_callee, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp + ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + MSRpstatePseudo 1, 1, undef $x7, 0, csr_aarch64_smstartstop + ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + BL &__arm_sme_state, csr_aarch64_sme_abi_support_routines_preservemost_from_x2, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $x0 + ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + MSRpstatePseudo 1, 0, undef $x7, 0, csr_aarch64_smstartstop, implicit-def $sp + BL @normal_callee, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp + ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + MSRpstatePseudo 1, 1, undef $x7, 0, csr_aarch64_smstartstop + RET_ReallyLR +... 
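The three MIR tests above pin down when the new aarch64-sme-peephole-opt pass may fold away a conditional smstart/smstop pair: both MSRpstatePseudo instructions must be keyed off the same virtual register holding the __arm_sme_state result and must carry the same register mask, and a value held only in a physical (or undef) register blocks the fold. The C++ sketch below is illustrative only and is not the implementation added by this patch; the helper names, the opcode parameter, and the assumed operand layout (operand 1 = start/stop immediate, operand 2 = condition register, trailing reg-mask operand = preserved-register mask) are taken from the MIR shown above, and register masks are compared by pointer, which suffices for the fixed CSR masks used here.

```cpp
// Illustrative sketch (not the pass as implemented in this patch): fold a
// conditional "smstart sm" followed by a matching conditional "smstop sm"
// when both pseudos test the same virtual register and use the same
// call-preserved register mask. A real pass must additionally prove that
// nothing in between changes or observes the streaming mode.
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstr.h"
#include <utility>

using namespace llvm;

// Assumed operand layout, matching the MIR above:
//   MSRpstatePseudo <pstatefield>, <0|1>, <condition reg>, <imm>, <regmask>
static bool isCondSMChange(const MachineInstr &MI, unsigned PseudoOpc,
                           bool &Enable, Register &CondReg,
                           const uint32_t *&Mask) {
  if (MI.getOpcode() != PseudoOpc)
    return false;
  Enable = MI.getOperand(1).getImm() == 1;
  const MachineOperand &Cond = MI.getOperand(2);
  if (!Cond.isReg() || !Cond.getReg().isVirtual())
    return false; // e.g. undef $x7: nothing to reason about, so bail out.
  CondReg = Cond.getReg();
  Mask = nullptr;
  for (const MachineOperand &MO : MI.operands())
    if (MO.isRegMask())
      Mask = MO.getRegMask();
  return Mask != nullptr;
}

static bool removeRedundantSMChanges(MachineBasicBlock &MBB,
                                     unsigned PseudoOpc) {
  SmallVector<std::pair<MachineInstr *, MachineInstr *>, 4> Pairs;
  MachineInstr *PendingStart = nullptr;
  Register PendingReg;
  const uint32_t *PendingMask = nullptr;

  for (MachineInstr &MI : MBB) {
    bool Enable;
    Register Reg;
    const uint32_t *Mask;
    if (!isCondSMChange(MI, PseudoOpc, Enable, Reg, Mask))
      continue;
    if (Enable) {
      PendingStart = &MI; // candidate "smstart" half of a pair
      PendingReg = Reg;
      PendingMask = Mask;
    } else if (PendingStart && Reg == PendingReg && Mask == PendingMask) {
      Pairs.emplace_back(PendingStart, &MI); // matching "smstop": fold both
      PendingStart = nullptr;
    } else {
      PendingStart = nullptr; // mismatched condition or mask: keep both
    }
  }

  for (auto &P : Pairs) {
    P.first->eraseFromParent();
    P.second->eraseFromParent();
  }
  return !Pairs.empty();
}
```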
diff --git a/llvm/test/CodeGen/AArch64/sme-read-write-tpidr2.ll b/llvm/test/CodeGen/AArch64/sme-read-write-tpidr2.ll --- a/llvm/test/CodeGen/AArch64/sme-read-write-tpidr2.ll +++ b/llvm/test/CodeGen/AArch64/sme-read-write-tpidr2.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=aarch64 -mattr=+sme < %s | FileCheck %s +; RUN: llc -mtriple=aarch64 -mattr=+sme -verify-machineinstrs < %s | FileCheck %s define i64 @get_tpidr2_el0() { ; CHECK-LABEL: get_tpidr2_el0: diff --git a/llvm/test/CodeGen/AArch64/sme-shared-za-abi.ll b/llvm/test/CodeGen/AArch64/sme-shared-za-abi.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme-shared-za-abi.ll @@ -0,0 +1,225 @@ +; RUN: opt -S -mtriple=aarch64-linux-gnu -opaque-pointers -aarch64-sme-abi %s | FileCheck %s +; RUN: opt -S -mtriple=aarch64-linux-gnu -opaque-pointers -aarch64-sme-abi -aarch64-sme-abi %s | FileCheck %s +; Check that the test also passes when not in opaque-pointer mode. +; RUN: opt -S -mtriple=aarch64-linux-gnu -aarch64-sme-abi %s -o /dev/null + +; XFAIL: * + +declare void @private_za_callee(); +declare void @shared_za_callee() "aarch64_pstate_za_shared" +declare void @new_za_callee() "aarch64_pstate_za_new" +declare void @preserved_za_callee() "aarch64_pstate_za_preserved" +declare i64 @foo(i64); + +declare float @llvm.cos.f32(float) + +; CHECK: type { ptr, i16, [6 x i8] } + +; Shared ZA Caller, Private ZA Callee + +define void @shared_za_caller() "aarch64_pstate_za_shared" { +; CHECK-LABEL: define {{[^@]+}}@shared_za_caller() #4 { +; CHECK-NEXT: %tpidr2.call.obj = alloca %tpidr2_ty, align 8 +; CHECK-NEXT: %N = call i64 @llvm.aarch64.sme.cntsb() +; CHECK-NEXT: %NN = mul i64 %N, %N +; CHECK-NEXT: %buffer = alloca i8, i64 %NN, align 16 +; CHECK-NEXT: %tpidr2.call.obj.buffer = getelementptr %tpidr2_ty, ptr %tpidr2.call.obj, i64 0, i32 0 +; CHECK-NEXT: store ptr %buffer, ptr %tpidr2.call.obj.buffer, align 8 +; CHECK-NEXT: %live = call i64 @llvm.aarch64.sme.get.live.za.slices() +; CHECK-NEXT: %live.trunc = trunc i64 %live to i16 +; CHECK-NEXT: %tpidr2.obj.live = getelementptr %tpidr2_ty, ptr %tpidr2.call.obj, i64 0, i32 1 +; CHECK-NEXT: store i16 %live.trunc, ptr %tpidr2.obj.live, align 2 +; CHECK-NEXT: %tpi.int = ptrtoint ptr %tpidr2.call.obj to i64 +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 %tpi.int) +; CHECK-NEXT: call void @private_za_callee() +; CHECK-NEXT: call void @llvm.aarch64.sme.start.pstateza() +; CHECK-NEXT: %tpidr2 = call i64 @llvm.aarch64.sme.get.tpidr2() +; CHECK-NEXT: %cmp = icmp eq i64 %tpidr2, 0 +; CHECK-NEXT: br i1 %cmp, label %restore.za, label %resume +; CHECK: restore.za: +; CHECK-NEXT: call void @llvm.aarch64.sme.tpidr2.restore(ptr %tpidr2.call.obj) +; CHECK-NEXT: br label %resume +; CHECK: resume: +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: ret void +; + call void @private_za_callee() + ret void +} + +; Shared ZA Caller, Private ZA Callees + +define i64 @shared_za_caller_multiple_callees(i64 %a) "aarch64_pstate_za_shared" { +; CHECK-LABEL: define {{[^@]+}}@shared_za_caller_multiple_callees(i64 %a) #4 { +; CHECK-NEXT: %tpidr2.call.obj = alloca %tpidr2_ty, align 8 +; CHECK-NEXT: %N = call i64 @llvm.aarch64.sme.cntsb() +; CHECK-NEXT: %NN = mul i64 %N, %N +; CHECK-NEXT: %buffer = alloca i8, i64 %NN, align 16 +; CHECK-NEXT: %tpidr2.call.obj.buffer = getelementptr %tpidr2_ty, ptr %tpidr2.call.obj, i64 0, i32 0 +; CHECK-NEXT: store ptr %buffer, ptr %tpidr2.call.obj.buffer, align 8 +; CHECK-NEXT: 
%live = call i64 @llvm.aarch64.sme.get.live.za.slices() +; CHECK-NEXT: %live.trunc = trunc i64 %live to i16 +; CHECK-NEXT: %tpidr2.obj.live = getelementptr %tpidr2_ty, ptr %tpidr2.call.obj, i64 0, i32 1 +; CHECK-NEXT: store i16 %live.trunc, ptr %tpidr2.obj.live, align 2 +; CHECK-NEXT: %tpi.int = ptrtoint ptr %tpidr2.call.obj to i64 +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 %tpi.int) +; CHECK-NEXT: %b = call i64 @foo(i64 %a) +; CHECK-NEXT: call void @llvm.aarch64.sme.start.pstateza() +; CHECK-NEXT: %tpidr2 = call i64 @llvm.aarch64.sme.get.tpidr2() +; CHECK-NEXT: %cmp = icmp eq i64 %tpidr2, 0 +; CHECK-NEXT: br i1 %cmp, label %restore.za, label %resume +; CHECK: restore.za: +; CHECK-NEXT: call void @llvm.aarch64.sme.tpidr2.restore(ptr %tpidr2.call.obj) +; CHECK-NEXT: br label %resume +; CHECK: resume: +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: %sum = add i64 %a, %b +; CHECK-NEXT: %live3 = call i64 @llvm.aarch64.sme.get.live.za.slices() +; CHECK-NEXT: %live.trunc4 = trunc i64 %live3 to i16 +; CHECK-NEXT: %tpidr2.obj.live5 = getelementptr %tpidr2_ty, ptr %tpidr2.call.obj, i64 0, i32 1 +; CHECK-NEXT: store i16 %live.trunc4, ptr %tpidr2.obj.live5, align 2 +; CHECK-NEXT: %tpi.int6 = ptrtoint ptr %tpidr2.call.obj to i64 +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 %tpi.int6) +; CHECK-NEXT: %c = call i64 @foo(i64 %sum) +; CHECK-NEXT: call void @llvm.aarch64.sme.start.pstateza() +; CHECK-NEXT: %tpidr27 = call i64 @llvm.aarch64.sme.get.tpidr2() +; CHECK-NEXT: %cmp8 = icmp eq i64 %tpidr27, 0 +; CHECK-NEXT: br i1 %cmp8, label %restore.za2, label %resume1 +; CHECK: restore.za2: +; CHECK-NEXT: call void @llvm.aarch64.sme.tpidr2.restore(ptr %tpidr2.call.obj) +; CHECK-NEXT: br label %resume1 +; CHECK: resume1: +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: %res = mul i64 %sum, %c +; CHECK-NEXT: ret i64 %res +; + %b = call i64 @foo(i64 %a) + %sum = add i64 %a, %b + %c = call i64 @foo(i64 %sum) + %res = mul i64 %sum, %c + ret i64 %res +} + +; Shared ZA Caller, New ZA Callee + +define void @shared_za_new_za_callee() "aarch64_pstate_za_shared" { +; CHECK-LABEL: define {{[^@]+}}@shared_za_new_za_callee() #4 { +; CHECK-NEXT: %tpidr2.call.obj = alloca %tpidr2_ty, align 8 +; CHECK-NEXT: %N = call i64 @llvm.aarch64.sme.cntsb() +; CHECK-NEXT: %NN = mul i64 %N, %N +; CHECK-NEXT: %buffer = alloca i8, i64 %NN, align 16 +; CHECK-NEXT: %tpidr2.call.obj.buffer = getelementptr %tpidr2_ty, ptr %tpidr2.call.obj, i64 0, i32 0 +; CHECK-NEXT: store ptr %buffer, ptr %tpidr2.call.obj.buffer, align 8 +; CHECK-NEXT: %live = call i64 @llvm.aarch64.sme.get.live.za.slices() +; CHECK-NEXT: %live.trunc = trunc i64 %live to i16 +; CHECK-NEXT: %tpidr2.obj.live = getelementptr %tpidr2_ty, ptr %tpidr2.call.obj, i64 0, i32 1 +; CHECK-NEXT: store i16 %live.trunc, ptr %tpidr2.obj.live, align 2 +; CHECK-NEXT: %tpi.int = ptrtoint ptr %tpidr2.call.obj to i64 +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 %tpi.int) +; CHECK-NEXT: call void @new_za_callee() +; CHECK-NEXT: call void @llvm.aarch64.sme.start.pstateza() +; CHECK-NEXT: %tpidr2 = call i64 @llvm.aarch64.sme.get.tpidr2() +; CHECK-NEXT: %cmp = icmp eq i64 %tpidr2, 0 +; CHECK-NEXT: br i1 %cmp, label %restore.za, label %resume +; CHECK: restore.za: +; CHECK-NEXT: call void @llvm.aarch64.sme.tpidr2.restore(ptr %tpidr2.call.obj) +; CHECK-NEXT: br label %resume +; CHECK: resume: +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: ret void +; + call void @new_za_callee() + 
ret void +} + +define void @shared_za_streaming_compatible_caller_private_za_callee() "aarch64_pstate_za_shared" "aarch64_pstate_sm_compatible" { +; CHECK-LABEL: define {{[^@]+}}@shared_za_streaming_compatible_caller_private_za_callee() #5 { +; CHECK-NEXT: %tpidr2.call.obj = alloca %tpidr2_ty, align 8 +; CHECK-NEXT: %N = call i64 @llvm.aarch64.sme.cntsb() +; CHECK-NEXT: %NN = mul i64 %N, %N +; CHECK-NEXT: %buffer = alloca i8, i64 %NN, align 16 +; CHECK-NEXT: %tpidr2.call.obj.buffer = getelementptr %tpidr2_ty, ptr %tpidr2.call.obj, i64 0, i32 0 +; CHECK-NEXT: store ptr %buffer, ptr %tpidr2.call.obj.buffer, align 8 +; CHECK-NEXT: %live = call i64 @llvm.aarch64.sme.get.live.za.slices() +; CHECK-NEXT: %live.trunc = trunc i64 %live to i16 +; CHECK-NEXT: %tpidr2.obj.live = getelementptr %tpidr2_ty, ptr %tpidr2.call.obj, i64 0, i32 1 +; CHECK-NEXT: store i16 %live.trunc, ptr %tpidr2.obj.live, align 2 +; CHECK-NEXT: %tpi.int = ptrtoint ptr %tpidr2.call.obj to i64 +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 %tpi.int) +; CHECK-NEXT: call void @private_za_callee() +; CHECK-NEXT: call void @llvm.aarch64.sme.start.pstateza() +; CHECK-NEXT: %tpidr2 = call i64 @llvm.aarch64.sme.get.tpidr2() +; CHECK-NEXT: %cmp = icmp eq i64 %tpidr2, 0 +; CHECK-NEXT: br i1 %cmp, label %restore.za, label %resume +; CHECK: restore.za: +; CHECK-NEXT: call void @llvm.aarch64.sme.tpidr2.restore(ptr %tpidr2.call.obj) +; CHECK-NEXT: br label %resume +; CHECK: resume: +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: ret void +; + call void @private_za_callee() + ret void +} + +; Shared ZA Caller, Shared ZA Callee (Lazy-save not required) + +define void @shared_za_shared_za_callee() "aarch64_pstate_za_shared" { +; CHECK-LABEL: define {{[^@]+}}@shared_za_shared_za_callee() #0 { +; CHECK-NEXT: call void @shared_za_callee() +; CHECK-NEXT: ret void +; + call void @shared_za_callee() + ret void +} + +; Ensure we also check the attribute on the call itself (not just from the called function) +define void @shared_za_shared_za_callee_from_ptr(ptr %fnptr) "aarch64_pstate_za_shared" { +; CHECK-LABEL: define {{[^@]+}}@shared_za_shared_za_callee_from_ptr(ptr %fnptr) #0 { +; CHECK-NEXT: call void %fnptr() #0 +; CHECK-NEXT: ret void +; + call void %fnptr() "aarch64_pstate_za_shared" + ret void +} + +; Shared ZA Caller, Preserved ZA Callee (Lazy-save not required) + +define void @shared_za_caller_preserved_za_callee() "aarch64_pstate_za_shared" { +; CHECK-LABEL: define {{[^@]+}}@shared_za_caller_preserved_za_callee() #0 { +; CHECK-NEXT: call void @preserved_za_callee() +; CHECK-NEXT: ret void +; + call void @preserved_za_callee() + ret void +} + +; Shared ZA Caller with Intrinsic Call (Lazy-save not required) + +define float @shared_za_caller_with_intrinsic(ptr %a) "aarch64_pstate_za_shared" { +; CHECK-LABEL: define {{[^@]+}}@shared_za_caller_with_intrinsic(ptr %a) #0 { +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 -1, ptr nonnull [[A:%.*]]) +; CHECK-NEXT: [[RES:%.*]] = load float, ptr [[A]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 -1, ptr nonnull [[A]]) +; CHECK-NEXT: ret float [[RES]] + call void @llvm.lifetime.start.p0(i64 -1, ptr nonnull %a) + %res = load float, ptr %a + call void @llvm.lifetime.start.p0(i64 -1, ptr nonnull %a) + ret float %res +} + +declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) +declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) + +;. 
+; CHECK: attributes #0 = { "aarch64_pstate_za_shared" } +; CHECK: attributes #1 = { "aarch64_pstate_za_new" } +; CHECK: attributes #2 = { "aarch64_pstate_za_preserved" } +; CHECK: attributes #3 = { nocallback nofree nosync nounwind readnone speculatable willreturn } +; CHECK: attributes #4 = { "aarch64_expanded_pstate_za" "aarch64_pstate_za_shared" } +; CHECK: attributes #5 = { "aarch64_expanded_pstate_za" "aarch64_pstate_sm_compatible" "aarch64_pstate_za_shared" } +; CHECK: attributes #6 = { argmemonly nocallback nofree nosync nounwind willreturn } +; CHECK: attributes #7 = { nocallback nofree nosync nounwind readnone willreturn } +; CHECK: attributes #8 = { nocallback nofree nosync nounwind willreturn } +; CHECK: attributes #9 = { inaccessiblememonly nocallback nofree nosync nounwind readonly willreturn } +;. diff --git a/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll b/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll --- a/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll +++ b/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll @@ -10,14 +10,15 @@ ; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill ; CHECK-NEXT: mov x29, sp ; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: mul x8, x8, x8 -; CHECK-NEXT: sub x9, x9, x8 -; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: rdsvl x9, #1 +; CHECK-NEXT: msub x8, x9, x9, x8 +; CHECK-NEXT: mov sp, x8 ; CHECK-NEXT: sub x10, x29, #16 -; CHECK-NEXT: stur x9, [x29, #-16] -; CHECK-NEXT: sturh w8, [x29, #-8] +; CHECK-NEXT: stur wzr, [x29, #-4] +; CHECK-NEXT: sturh wzr, [x29, #-6] +; CHECK-NEXT: stur x8, [x29, #-16] +; CHECK-NEXT: sturh w9, [x29, #-8] ; CHECK-NEXT: msr TPIDR2_EL0, x10 ; CHECK-NEXT: bl private_za_callee ; CHECK-NEXT: smstart za @@ -42,14 +43,15 @@ ; CHECK-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill ; CHECK-NEXT: mov x29, sp ; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: mul x8, x8, x8 -; CHECK-NEXT: sub x9, x9, x8 -; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: rdsvl x9, #1 +; CHECK-NEXT: msub x8, x9, x9, x8 +; CHECK-NEXT: mov sp, x8 ; CHECK-NEXT: sub x10, x29, #16 -; CHECK-NEXT: stur x9, [x29, #-16] -; CHECK-NEXT: sturh w8, [x29, #-8] +; CHECK-NEXT: stur wzr, [x29, #-4] +; CHECK-NEXT: sturh wzr, [x29, #-6] +; CHECK-NEXT: stur x8, [x29, #-16] +; CHECK-NEXT: sturh w9, [x29, #-8] ; CHECK-NEXT: msr TPIDR2_EL0, x10 ; CHECK-NEXT: bl __addtf3 ; CHECK-NEXT: smstart za diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-body.ll b/llvm/test/CodeGen/AArch64/sme-streaming-body.ll --- a/llvm/test/CodeGen/AArch64/sme-streaming-body.ll +++ b/llvm/test/CodeGen/AArch64/sme-streaming-body.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -start-after=simplifycfg -enable-tail-merge=false -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -start-after=simplifycfg -enable-tail-merge=false -enable-aarch64-sme-peephole-opt=false -verify-machineinstrs < %s | FileCheck %s declare void @normal_callee(); declare void @streaming_callee() "aarch64_pstate_sm_enabled"; diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll --- a/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll +++ b/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -enable-aarch64-sme-peephole-opt=false -verify-machineinstrs < %s | FileCheck %s ; This file tests the following combinations related to streaming-enabled functions: ; [ ] N -> S (Normal -> Streaming) diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-body.ll b/llvm/test/CodeGen/AArch64/sme-streaming-locally-interface.ll copy from llvm/test/CodeGen/AArch64/sme-streaming-body.ll copy to llvm/test/CodeGen/AArch64/sme-streaming-locally-interface.ll --- a/llvm/test/CodeGen/AArch64/sme-streaming-body.ll +++ b/llvm/test/CodeGen/AArch64/sme-streaming-locally-interface.ll @@ -3,7 +3,8 @@ declare void @normal_callee(); declare void @streaming_callee() "aarch64_pstate_sm_enabled"; -declare void @streaming_compatible_callee() "aarch64_pstate_sm_compatible"; + +; Streaming Caller, Locally Streaming Callee define void @locally_streaming_caller_streaming_callee() "aarch64_pstate_sm_body" nounwind { ; CHECK-LABEL: locally_streaming_caller_streaming_callee: @@ -14,8 +15,8 @@ ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill ; CHECK-NEXT: smstart sm -; CHECK-NEXT: bl streaming_compatible_callee -; CHECK-NEXT: bl streaming_compatible_callee +; CHECK-NEXT: bl streaming_callee +; CHECK-NEXT: bl streaming_callee ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload @@ -24,21 +25,6 @@ ; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload ; CHECK-NEXT: ret - call void @streaming_compatible_callee(); - call void @streaming_compatible_callee(); - ret void; -} - -; Test that a streaming body and streaming 
interface, no smstart/smstop are emitted, -; because the function already is in streaming mode upon entry. -define void @streaming_and_locally_streaming_caller_streaming_callee() "aarch64_pstate_sm_enabled" "aarch64_pstate_sm_body" nounwind { -; CHECK-LABEL: streaming_and_locally_streaming_caller_streaming_callee: -; CHECK: // %bb.0: -; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: bl streaming_callee -; CHECK-NEXT: bl streaming_callee -; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-NEXT: ret call void @streaming_callee(); call void @streaming_callee(); ret void; @@ -53,7 +39,7 @@ ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: smstart sm ; CHECK-NEXT: cmp x0, #1 -; CHECK-NEXT: b.ne .LBB2_2 +; CHECK-NEXT: b.ne .LBB1_2 ; CHECK-NEXT: // %bb.1: // %if.else ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload @@ -61,7 +47,7 @@ ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp], #64 // 16-byte Folded Reload ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB2_2: // %if.end +; CHECK-NEXT: .LBB1_2: // %if.end ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload @@ -94,9 +80,9 @@ ; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-NEXT: smstart sm -; CHECK-NEXT: adrp x8, .LCPI3_0 +; CHECK-NEXT: adrp x8, .LCPI2_0 ; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload -; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI3_0] +; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI2_0] ; CHECK-NEXT: add v0.2d, v1.2d, v0.2d ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm @@ -112,27 +98,39 @@ ret <2 x i64> %add; } -; Test that we use the interface (not the function's body) to determine what -; streaming-mode to enter the callee. In this case the interface is normal, so -; pstate.sm must be 0 on entry and is 0 upon return from the callee. +define void @locally_streaming_caller_normal_callee() "aarch64_pstate_sm_body" nounwind { +; CHECK-LABEL: locally_streaming_caller_normal_callee: +; CHECK: // %bb.0: +; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: bl normal_callee +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + + call void @normal_callee(); + ret void; +} + +define void @streaming_and_locally_streaming_caller_streaming_callee() "aarch64_pstate_sm_enabled" "aarch64_pstate_sm_body" nounwind { +; CHECK-LABEL: streaming_and_locally_streaming_caller_streaming_callee: +; CHECK: // %bb.0: +; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: bl streaming_callee +; CHECK-NEXT: bl streaming_callee +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + call void @streaming_callee(); + call void @streaming_callee(); + ret void; +} + +; Locally Streaming Caller, Locally Streaming Callee + define void @locally_streaming_caller_locally_streaming_callee() "aarch64_pstate_sm_body" nounwind { ; CHECK-LABEL: locally_streaming_caller_locally_streaming_callee: ; CHECK: // %bb.0: -; CHECK-NEXT: stp d15, d14, [sp, #-80]! 
// 16-byte Folded Spill -; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill -; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill -; CHECK-NEXT: smstart sm -; CHECK-NEXT: smstop sm +; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: bl locally_streaming_caller_streaming_callee -; CHECK-NEXT: smstart sm -; CHECK-NEXT: smstop sm -; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload -; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret call void @locally_streaming_caller_streaming_callee(); @@ -175,32 +173,33 @@ declare <2 x i64> @streaming_compatible_callee_vec_args_ret(<2 x i64>) "aarch64_pstate_sm_compatible" -define {<2 x i64>, <2 x i64>} @locally_streaming_caller_compatible_callee_struct_arg_ret({<2 x i64>, <2 x i64>} %arg) "aarch64_pstate_sm_body" nounwind { +define <2 x i64> @locally_streaming_caller_compatible_callee_struct_arg_ret({<2 x i64>, <2 x i64>} %arg) "aarch64_pstate_sm_body" nounwind { ; CHECK-LABEL: locally_streaming_caller_compatible_callee_struct_arg_ret: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #112 -; CHECK-NEXT: stp d15, d14, [sp, #32] // 16-byte Folded Spill -; CHECK-NEXT: stp d13, d12, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp d11, d10, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: stp d9, d8, [sp, #80] // 16-byte Folded Spill -; CHECK-NEXT: str x30, [sp, #96] // 8-byte Folded Spill -; CHECK-NEXT: str q1, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: str x30, [sp, #80] // 8-byte Folded Spill +; CHECK-NEXT: str q1, [sp] // 16-byte Folded Spill ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: bl streaming_compatible_callee_vec_arg_struct_ret -; CHECK-NEXT: stp q1, q0, [sp] // 32-byte Folded Spill +; CHECK-NEXT: str q1, [sp] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm -; CHECK-NEXT: ldp q1, q0, [sp] // 32-byte Folded Reload -; CHECK-NEXT: ldp d9, d8, [sp, #80] // 16-byte Folded Reload -; CHECK-NEXT: ldp d11, d10, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: ldp d13, d12, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldp d15, d14, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp, #96] // 8-byte Folded Reload -; CHECK-NEXT: add sp, sp, #112 +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload +; CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: ret %v1.arg = extractvalue {<2 x i64>, <2 x i64>} %arg, 1 %res = call {<2 x i64>, <2 x i64>} @streaming_compatible_callee_vec_arg_struct_ret(<2 x i64> %v1.arg) 
"aarch64_pstate_sm_compatible" - ret {<2 x i64>, <2 x i64>} %res; + %v1.res = extractvalue {<2 x i64>, <2 x i64>} %res, 1 + ret <2 x i64> %v1.res; } declare {<2 x i64>, <2 x i64>} @streaming_compatible_callee_vec_arg_struct_ret(<2 x i64>) "aarch64_pstate_sm_compatible" @@ -263,46 +262,3 @@ } declare double @llvm.cos.f64(double) - - -define float @test_arg_survives_loop(float %arg, i32 %N) nounwind "aarch64_pstate_sm_body" { -; CHECK-LABEL: test_arg_survives_loop: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sub sp, sp, #80 -; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill -; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: str s0, [sp, #12] // 4-byte Folded Spill -; CHECK-NEXT: smstart sm -; CHECK-NEXT: .LBB9_1: // %for.body -; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: subs w0, w0, #1 -; CHECK-NEXT: b.ne .LBB9_1 -; CHECK-NEXT: // %bb.2: // %for.cond.cleanup -; CHECK-NEXT: fmov s0, #1.00000000 -; CHECK-NEXT: ldr s1, [sp, #12] // 4-byte Folded Reload -; CHECK-NEXT: fadd s0, s1, s0 -; CHECK-NEXT: str s0, [sp, #12] // 4-byte Folded Spill -; CHECK-NEXT: smstop sm -; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: ldr s0, [sp, #12] // 4-byte Folded Reload -; CHECK-NEXT: add sp, sp, #80 -; CHECK-NEXT: ret -entry: - br label %for.body - -for.body: - %i.02 = phi i32 [ 0, %entry ], [ %inc, %for.body ] - %inc = add nuw nsw i32 %i.02, 1 - %exitcond.not = icmp eq i32 %inc, %N - br i1 %exitcond.not, label %for.cond.cleanup, label %for.body - -for.cond.cleanup: - %add = fadd float %arg, 1.000000e+00 - ret float %add - -} diff --git a/llvm/test/CodeGen/AArch64/sme-streaming.ll b/llvm/test/CodeGen/AArch64/sme-streaming.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme-streaming.ll @@ -0,0 +1,23 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme < %s | FileCheck %s + +; Streaming mode functions can use the full SME instruction set and the subset +; of SVE and NEON instructions that are legal in streaming mode. 
+; + +; CHECK: sclamp z0.b, z0.b, z0.b +define <vscale x 16 x i8> @streaming_compatible_sme(<vscale x 16 x i8> %x) { + %1 = call <vscale x 16 x i8> asm "sclamp $0.b, $0.b, $0.b", "=w,0"(<vscale x 16 x i8> %x) + ret <vscale x 16 x i8> %1 +} + +; CHECK: add z0.b, z0.b, z0.b +define <vscale x 16 x i8> @streaming_compatible_sve(<vscale x 16 x i8> %x) { + %1 = call <vscale x 16 x i8> asm "add $0.b, $0.b, $0.b", "=w,0"(<vscale x 16 x i8> %x) + ret <vscale x 16 x i8> %1 +} + +; CHECK: fmulx s0, s0, s0 +define float @streaming_compatible_neon(float %x) { + %1 = call float asm "fmulx ${0:s}, ${0:s}, ${0:s}", "=w,0"(float %x) + ret float %1 +} diff --git a/llvm/test/CodeGen/AArch64/sme-tailcall.ll b/llvm/test/CodeGen/AArch64/sme-tailcall.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme-tailcall.ll @@ -0,0 +1,194 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -opaque-pointers -mattr=+sme < %s | FileCheck %s + +declare void @normal_callee(); +declare void @streaming_callee() "aarch64_pstate_sm_enabled"; +declare void @streaming_compatible_callee() "aarch64_pstate_sm_compatible"; + + +; Caller is non-streaming mode + +define void @non_streaming_caller_to_streaming_callee() nounwind { +; CHECK-LABEL: non_streaming_caller_to_streaming_callee: +; CHECK: // %bb.0: // %entry +; CHECK: smstart sm +; CHECK: bl streaming_callee +; CHECK: smstop sm +entry: + tail call void @streaming_callee() "aarch64_pstate_sm_enabled" + ret void +} + +define void @non_streaming_caller_to_streaming_compatible_callee() nounwind { +; CHECK-LABEL: non_streaming_caller_to_streaming_compatible_callee: +; CHECK: // %bb.0: // %entry +; CHECK-NOT: {{smstart|smstop}} +; CHECK: b streaming_compatible_callee +; CHECK-NOT: {{smstart|smstop}} +entry: + tail call void @streaming_compatible_callee() "aarch64_pstate_sm_compatible" + ret void +} + +; Caller is streaming mode + +define void @streaming_caller_to_streaming_callee() nounwind "aarch64_pstate_sm_enabled" { +; CHECK-LABEL: streaming_caller_to_streaming_callee: +; CHECK: // %bb.0: // %entry +; CHECK-NOT: {{smstart|smstop}} +; CHECK-NEXT: b streaming_callee +; CHECK-NOT: {{smstart|smstop}} +entry: + tail call void @streaming_callee() "aarch64_pstate_sm_enabled" + ret void +} + +define void @streaming_caller_to_non_streaming_callee() nounwind "aarch64_pstate_sm_enabled" { +; CHECK-LABEL: streaming_caller_to_non_streaming_callee: +; CHECK: // %bb.0: // %entry +; CHECK: smstop sm +; CHECK: bl normal_callee +; CHECK: smstart sm +entry: + tail call void @normal_callee() + ret void +} + +define void @streaming_caller_to_streaming_compatible_callee() nounwind "aarch64_pstate_sm_enabled" { +; CHECK-LABEL: streaming_caller_to_streaming_compatible_callee: +; CHECK: // %bb.0: // %entry +; CHECK-NOT: {{smstart|smstop}} +; CHECK: b streaming_compatible_callee +; CHECK-NOT: {{smstart|smstop}} +entry: + tail call void @streaming_compatible_callee() "aarch64_pstate_sm_compatible" + ret void +} + +; Caller is streaming compatible mode + +define void @streaming_compatible_caller_to_streaming_callee() nounwind "aarch64_pstate_sm_compatible"{ +; CHECK-LABEL: streaming_compatible_caller_to_streaming_callee: +; CHECK: // %bb.1: // %entry +; CHECK: smstart sm +; CHECK: bl streaming_callee +; CHECK: smstop sm +entry: + tail call void @streaming_callee() "aarch64_pstate_sm_enabled" + ret void +} + +define void @streaming_compatible_caller_to_non_streaming_callee() nounwind "aarch64_pstate_sm_compatible"{ +; CHECK-LABEL: streaming_compatible_caller_to_non_streaming_callee: +; CHECK: // %bb.1: // %entry +; CHECK: smstop sm +; CHECK: bl normal_callee +; CHECK: smstart sm +entry: + tail call void @normal_callee() + ret void +} + +define void
@streaming_compatible_caller_to_streaming_compatible_callee() nounwind "aarch64_pstate_sm_compatible" { +; CHECK-LABEL: streaming_compatible_caller_to_streaming_compatible_callee: +; CHECK: // %bb.0: // %entry +; CHECK-NOT: {{smstart|smstop}} +; CHECK: b streaming_compatible_callee +; CHECK-NOT: {{smstart|smstop}} +entry: + tail call void @streaming_compatible_callee() "aarch64_pstate_sm_compatible" + ret void +} + +declare void @za_new_callee() "aarch64_pstate_za_new"; +declare void @za_shared_callee() "aarch64_pstate_za_shared"; +declare void @za_preserved_callee() "aarch64_pstate_za_preserved"; + +; Caller with ZA state new + +define void @za_new_caller_to_za_new_callee() nounwind "aarch64_pstate_za_new" { +; CHECK-LABEL: za_new_caller_to_za_new_callee: +; CHECK: // %bb.0: // %prelude +; CHECK: bl __arm_tpidr2_save +; CHECK: smstart za +; CHECK: bl za_new_callee +; CHECK: smstart za +; CHECK: bl __arm_tpidr2_restore +; CHECK: smstop za +entry: + tail call void @za_new_callee() "aarch64_pstate_za_new"; + ret void; +} + + +define void @za_new_caller_to_za_shared_callee() nounwind "aarch64_pstate_za_new" { +; CHECK-LABEL: za_new_caller_to_za_shared_callee: +; CHECK: // %bb.0: // %prelude +; CHECK: bl __arm_tpidr2_save +; CHECK: smstart za +; CHECK: bl za_shared_callee +; CHECK: smstop za +entry: + tail call void @za_shared_callee() "aarch64_pstate_za_shared"; + ret void; +} + +define void @za_new_caller_to_za_preserved_callee() nounwind "aarch64_pstate_za_new" { +; CHECK-LABEL: za_new_caller_to_za_preserved_callee: +; CHECK: // %bb.0: // %prelude +; CHECK: bl __arm_tpidr2_save +; CHECK: smstart za +; CHECK: bl za_preserved_callee +; CHECK: smstop za +entry: + tail call void @za_preserved_callee() "aarch64_pstate_za_preserved"; + ret void; +} + + +; Caller with ZA state shared + +define void @za_shared_caller_to_za_new_callee() nounwind "aarch64_pstate_za_shared" { +; CHECK-LABEL: za_shared_caller_to_za_new_callee: +; CHECK: // %bb.0: // %entry +; CHECK: bl za_new_callee +; CHECK-NEXT: smstart za +; CHECK: bl __arm_tpidr2_restore +entry: + tail call void @za_new_callee() "aarch64_pstate_za_new"; + ret void; +} + +define void @za_shared_caller_to_za_shared_callee() nounwind "aarch64_pstate_za_shared" { +; CHECK-LABEL: za_shared_caller_to_za_shared_callee: +; CHECK: // %bb.0: // %entry +; CHECK-NOT: {{smstart|smstop}} +; CHECK: b za_shared_callee +; CHECK-NOT: {{smstart|smstop}} +entry: + tail call void @za_shared_callee() "aarch64_pstate_za_shared"; + ret void; +} + +define void @za_shared_caller_to_za_preserved_callee() nounwind "aarch64_pstate_za_shared" { +; CHECK-LABEL: za_shared_caller_to_za_preserved_callee: +; CHECK: // %bb.0: // %entry +; CHECK-NOT: {{smstart|smstop}} +; CHECK: b za_preserved_callee +; CHECK-NOT: {{smstart|smstop}} +entry: + tail call void @za_preserved_callee() "aarch64_pstate_za_preserved"; + ret void; +} + +; Caller with ZA state preserved + +define void @za_preserved_caller_to_za_preserved_callee() nounwind "aarch64_pstate_za_preserved" { +; CHECK-LABEL: za_preserved_caller_to_za_preserved_callee: +; CHECK: // %bb.0: // %entry +; CHECK-NOT: {{smstart|smstop}} +; CHECK-NEXT: b za_preserved_callee +; CHECK-NOT: {{smstart|smstop}} +entry: + tail call void @za_preserved_callee() "aarch64_pstate_za_preserved"; + ret void; +} diff --git a/llvm/test/CodeGen/AArch64/sme-toggle-pstateza.ll b/llvm/test/CodeGen/AArch64/sme-toggle-pstateza.ll --- a/llvm/test/CodeGen/AArch64/sme-toggle-pstateza.ll +++ b/llvm/test/CodeGen/AArch64/sme-toggle-pstateza.ll @@ -1,5 +1,5 @@ ; NOTE: 
Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=aarch64 -mattr=+sme -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=aarch64 -mattr=+sme -enable-aarch64-sme-peephole-opt=false -verify-machineinstrs < %s | FileCheck %s define void @toggle_pstate_za() { ; CHECK-LABEL: toggle_pstate_za: diff --git a/llvm/test/CodeGen/AArch64/sme-zt0-preserve.ll b/llvm/test/CodeGen/AArch64/sme-zt0-preserve.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme-zt0-preserve.ll @@ -0,0 +1,455 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -start-after=simplifycfg -enable-tail-merge=false -enable-aarch64-sme-peephole-opt=false -verify-machineinstrs < %s | FileCheck %s --check-prefix=SME +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -start-after=simplifycfg -enable-tail-merge=false -enable-aarch64-sme-peephole-opt=false -verify-machineinstrs < %s | FileCheck %s --check-prefix=SME2 + +; Normal callee, no ZA state +declare void @normal_callee(); + +; Callees with ZA state +declare void @za_shared_callee() "aarch64_pstate_za_shared"; +declare void @za_new_callee() "aarch64_pstate_za_new"; + +; Callee with preserved ZA state +declare void @za_preserved_callee() "aarch64_pstate_za_preserved"; + + +define void @za_new_caller_normal_callee() "aarch64_pstate_za_new" nounwind { +; SME-LABEL: za_new_caller_normal_callee: +; SME: // %bb.0: // %prelude +; SME-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; SME-NEXT: mov x29, sp +; SME-NEXT: sub sp, sp, #16 +; SME-NEXT: mov x8, sp +; SME-NEXT: rdsvl x9, #1 +; SME-NEXT: msub x8, x9, x9, x8 +; SME-NEXT: mov sp, x8 +; SME-NEXT: stur wzr, [x29, #-4] +; SME-NEXT: sturh wzr, [x29, #-6] +; SME-NEXT: stur x8, [x29, #-16] +; SME-NEXT: mrs x8, TPIDR2_EL0 +; SME-NEXT: cbz x8, .LBB0_2 +; SME-NEXT: // %bb.1: // %save.za +; SME-NEXT: bl __arm_tpidr2_save +; SME-NEXT: msr TPIDR2_EL0, xzr +; SME-NEXT: .LBB0_2: +; SME-NEXT: smstart za +; SME-NEXT: sub x8, x29, #16 +; SME-NEXT: sturh w9, [x29, #-8] +; SME-NEXT: msr TPIDR2_EL0, x8 +; SME-NEXT: bl normal_callee +; SME-NEXT: smstart za +; SME-NEXT: sub x0, x29, #16 +; SME-NEXT: mrs x8, TPIDR2_EL0 +; SME-NEXT: cbnz x8, .LBB0_4 +; SME-NEXT: // %bb.3: +; SME-NEXT: bl __arm_tpidr2_restore +; SME-NEXT: .LBB0_4: +; SME-NEXT: msr TPIDR2_EL0, xzr +; SME-NEXT: smstop za +; SME-NEXT: mov sp, x29 +; SME-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; SME-NEXT: ret +; +; SME2-LABEL: za_new_caller_normal_callee: +; SME2: // %bb.0: // %prelude +; SME2-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill +; SME2-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; SME2-NEXT: mov x29, sp +; SME2-NEXT: sub sp, sp, #80 +; SME2-NEXT: mov x8, sp +; SME2-NEXT: rdsvl x9, #1 +; SME2-NEXT: msub x8, x9, x9, x8 +; SME2-NEXT: mov sp, x8 +; SME2-NEXT: stur wzr, [x29, #-4] +; SME2-NEXT: sturh wzr, [x29, #-6] +; SME2-NEXT: stur x8, [x29, #-16] +; SME2-NEXT: mrs x8, TPIDR2_EL0 +; SME2-NEXT: cbz x8, .LBB0_2 +; SME2-NEXT: // %bb.1: // %save.za +; SME2-NEXT: bl __arm_tpidr2_save +; SME2-NEXT: msr TPIDR2_EL0, xzr +; SME2-NEXT: .LBB0_2: +; SME2-NEXT: smstart za +; SME2-NEXT: sub x8, x29, #16 +; SME2-NEXT: sub x19, x29, #80 +; SME2-NEXT: zero { zt0 } +; SME2-NEXT: sturh w9, [x29, #-8] +; SME2-NEXT: msr TPIDR2_EL0, x8 +; SME2-NEXT: str zt0, [x19] +; SME2-NEXT: bl normal_callee +; SME2-NEXT: smstart za +; SME2-NEXT: ldr zt0, [x19] +; SME2-NEXT: sub x0, x29, #16 +; SME2-NEXT: mrs x8, TPIDR2_EL0 +; SME2-NEXT: cbnz x8, .LBB0_4 +; SME2-NEXT: // %bb.3: +; SME2-NEXT: bl __arm_tpidr2_restore +; SME2-NEXT: .LBB0_4: +; SME2-NEXT: msr TPIDR2_EL0, xzr +; SME2-NEXT: smstop za +; SME2-NEXT: mov sp, x29 +; SME2-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; SME2-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; SME2-NEXT: ret + call void @normal_callee(); + ret void; +} + +define void @za_new_caller_za_callee() "aarch64_pstate_za_new" nounwind { +; SME-LABEL: za_new_caller_za_callee: +; SME: // %bb.0: // %prelude +; SME-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; SME-NEXT: mov x29, sp +; SME-NEXT: sub sp, sp, #16 +; SME-NEXT: mov x8, sp +; SME-NEXT: rdsvl x9, #1 +; SME-NEXT: msub x8, x9, x9, x8 +; SME-NEXT: mov sp, x8 +; SME-NEXT: stur wzr, [x29, #-4] +; SME-NEXT: sturh wzr, [x29, #-6] +; SME-NEXT: stur x8, [x29, #-16] +; SME-NEXT: mrs x8, TPIDR2_EL0 +; SME-NEXT: cbz x8, .LBB1_2 +; SME-NEXT: // %bb.1: // %save.za +; SME-NEXT: bl __arm_tpidr2_save +; SME-NEXT: msr TPIDR2_EL0, xzr +; SME-NEXT: .LBB1_2: +; SME-NEXT: smstart za +; SME-NEXT: sub x8, x29, #16 +; SME-NEXT: sturh w9, [x29, #-8] +; SME-NEXT: msr TPIDR2_EL0, x8 +; SME-NEXT: bl za_new_callee +; SME-NEXT: smstart za +; SME-NEXT: sub x0, x29, #16 +; SME-NEXT: mrs x8, TPIDR2_EL0 +; SME-NEXT: cbnz x8, .LBB1_4 +; SME-NEXT: // %bb.3: +; SME-NEXT: bl __arm_tpidr2_restore +; SME-NEXT: .LBB1_4: +; SME-NEXT: msr TPIDR2_EL0, xzr +; SME-NEXT: bl za_shared_callee +; SME-NEXT: smstop za +; SME-NEXT: mov sp, x29 +; SME-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; SME-NEXT: ret +; +; SME2-LABEL: za_new_caller_za_callee: +; SME2: // %bb.0: // %prelude +; SME2-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill +; SME2-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; SME2-NEXT: mov x29, sp +; SME2-NEXT: sub sp, sp, #144 +; SME2-NEXT: mov x8, sp +; SME2-NEXT: rdsvl x9, #1 +; SME2-NEXT: msub x8, x9, x9, x8 +; SME2-NEXT: mov sp, x8 +; SME2-NEXT: stur wzr, [x29, #-4] +; SME2-NEXT: sturh wzr, [x29, #-6] +; SME2-NEXT: stur x8, [x29, #-16] +; SME2-NEXT: mrs x8, TPIDR2_EL0 +; SME2-NEXT: cbz x8, .LBB1_2 +; SME2-NEXT: // %bb.1: // %save.za +; SME2-NEXT: bl __arm_tpidr2_save +; SME2-NEXT: msr TPIDR2_EL0, xzr +; SME2-NEXT: .LBB1_2: +; SME2-NEXT: smstart za +; SME2-NEXT: sub x8, x29, #16 +; SME2-NEXT: sub x19, x29, #80 +; SME2-NEXT: zero { zt0 } +; SME2-NEXT: sturh w9, [x29, #-8] +; SME2-NEXT: msr TPIDR2_EL0, x8 +; SME2-NEXT: str zt0, [x19] +; SME2-NEXT: bl za_new_callee +; SME2-NEXT: smstart za +; SME2-NEXT: ldr zt0, [x19] +; SME2-NEXT: sub x0, x29, #16 +; SME2-NEXT: mrs x8, TPIDR2_EL0 +; SME2-NEXT: cbnz x8, .LBB1_4 +; SME2-NEXT: // %bb.3: +; SME2-NEXT: bl __arm_tpidr2_restore +; SME2-NEXT: .LBB1_4: +; SME2-NEXT: sub x19, x29, #144 +; SME2-NEXT: msr TPIDR2_EL0, xzr +; SME2-NEXT: str zt0, [x19] +; SME2-NEXT: bl za_shared_callee +; SME2-NEXT: ldr zt0, [x19] +; SME2-NEXT: smstop za +; SME2-NEXT: mov sp, x29 +; SME2-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; SME2-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; SME2-NEXT: ret + call void @za_new_callee(); + call void @za_shared_callee(); + ret void; +} + +define void @za_shared_caller_normal_callee() "aarch64_pstate_za_shared" nounwind { +; SME-LABEL: za_shared_caller_normal_callee: +; SME: // %bb.0: +; SME-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; SME-NEXT: mov x29, sp +; SME-NEXT: sub sp, sp, #16 +; SME-NEXT: mov x8, sp +; SME-NEXT: rdsvl x9, #1 +; SME-NEXT: msub x8, x9, x9, x8 +; SME-NEXT: mov sp, x8 +; SME-NEXT: sub x10, x29, #16 +; SME-NEXT: stur wzr, [x29, #-4] +; SME-NEXT: sturh wzr, [x29, #-6] +; SME-NEXT: stur x8, [x29, #-16] +; SME-NEXT: sturh w9, [x29, #-8] +; SME-NEXT: msr TPIDR2_EL0, x10 +; SME-NEXT: bl normal_callee +; SME-NEXT: smstart za +; SME-NEXT: sub x0, x29, #16 +; SME-NEXT: mrs x8, TPIDR2_EL0 +; SME-NEXT: cbnz x8, .LBB2_2 +; SME-NEXT: // %bb.1: +; SME-NEXT: bl __arm_tpidr2_restore +; SME-NEXT: .LBB2_2: +; SME-NEXT: msr TPIDR2_EL0, xzr +; SME-NEXT: mov sp, x29 +; SME-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; SME-NEXT: ret +; +; SME2-LABEL: za_shared_caller_normal_callee: +; SME2: // %bb.0: +; SME2-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill +; SME2-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; SME2-NEXT: mov x29, sp +; SME2-NEXT: sub sp, sp, #80 +; SME2-NEXT: mov x8, sp +; SME2-NEXT: rdsvl x9, #1 +; SME2-NEXT: msub x8, x9, x9, x8 +; SME2-NEXT: mov sp, x8 +; SME2-NEXT: sub x10, x29, #16 +; SME2-NEXT: sub x19, x29, #80 +; SME2-NEXT: stur wzr, [x29, #-4] +; SME2-NEXT: sturh wzr, [x29, #-6] +; SME2-NEXT: stur x8, [x29, #-16] +; SME2-NEXT: sturh w9, [x29, #-8] +; SME2-NEXT: msr TPIDR2_EL0, x10 +; SME2-NEXT: str zt0, [x19] +; SME2-NEXT: bl normal_callee +; SME2-NEXT: smstart za +; SME2-NEXT: ldr zt0, [x19] +; SME2-NEXT: sub x0, x29, #16 +; SME2-NEXT: mrs x8, TPIDR2_EL0 +; SME2-NEXT: cbnz x8, .LBB2_2 +; SME2-NEXT: // %bb.1: +; SME2-NEXT: bl __arm_tpidr2_restore +; SME2-NEXT: .LBB2_2: +; SME2-NEXT: msr TPIDR2_EL0, xzr +; SME2-NEXT: mov sp, x29 +; SME2-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; SME2-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; SME2-NEXT: ret + call void @normal_callee(); + ret void; +} + +define void @za_shared_caller_za_callee() "aarch64_pstate_za_shared" nounwind { +; SME-LABEL: za_shared_caller_za_callee: +; SME: // %bb.0: +; SME-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; SME-NEXT: mov x29, sp +; SME-NEXT: sub sp, sp, #16 +; SME-NEXT: mov x8, sp +; SME-NEXT: rdsvl x9, #1 +; SME-NEXT: msub x8, x9, x9, x8 +; SME-NEXT: mov sp, x8 +; SME-NEXT: sub x10, x29, #16 +; SME-NEXT: stur wzr, [x29, #-4] +; SME-NEXT: sturh wzr, [x29, #-6] +; SME-NEXT: stur x8, [x29, #-16] +; SME-NEXT: sturh w9, [x29, #-8] +; SME-NEXT: msr TPIDR2_EL0, x10 +; SME-NEXT: bl za_new_callee +; SME-NEXT: smstart za +; SME-NEXT: sub x0, x29, #16 +; SME-NEXT: mrs x8, TPIDR2_EL0 +; SME-NEXT: cbnz x8, .LBB3_2 +; SME-NEXT: // %bb.1: +; SME-NEXT: bl __arm_tpidr2_restore +; SME-NEXT: .LBB3_2: +; SME-NEXT: msr TPIDR2_EL0, xzr +; SME-NEXT: bl za_shared_callee +; SME-NEXT: mov sp, x29 +; SME-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; SME-NEXT: ret +; +; SME2-LABEL: za_shared_caller_za_callee: +; SME2: // %bb.0: +; SME2-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; SME2-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; SME2-NEXT: mov x29, sp +; SME2-NEXT: sub sp, sp, #144 +; SME2-NEXT: mov x8, sp +; SME2-NEXT: rdsvl x9, #1 +; SME2-NEXT: msub x8, x9, x9, x8 +; SME2-NEXT: mov sp, x8 +; SME2-NEXT: sub x10, x29, #16 +; SME2-NEXT: sub x19, x29, #80 +; SME2-NEXT: stur wzr, [x29, #-4] +; SME2-NEXT: sturh wzr, [x29, #-6] +; SME2-NEXT: stur x8, [x29, #-16] +; SME2-NEXT: sturh w9, [x29, #-8] +; SME2-NEXT: msr TPIDR2_EL0, x10 +; SME2-NEXT: str zt0, [x19] +; SME2-NEXT: bl za_new_callee +; SME2-NEXT: smstart za +; SME2-NEXT: ldr zt0, [x19] +; SME2-NEXT: sub x0, x29, #16 +; SME2-NEXT: mrs x8, TPIDR2_EL0 +; SME2-NEXT: cbnz x8, .LBB3_2 +; SME2-NEXT: // %bb.1: +; SME2-NEXT: bl __arm_tpidr2_restore +; SME2-NEXT: .LBB3_2: +; SME2-NEXT: sub x19, x29, #144 +; SME2-NEXT: msr TPIDR2_EL0, xzr +; SME2-NEXT: str zt0, [x19] +; SME2-NEXT: bl za_shared_callee +; SME2-NEXT: ldr zt0, [x19] +; SME2-NEXT: mov sp, x29 +; SME2-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; SME2-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; SME2-NEXT: ret + call void @za_new_callee(); + call void @za_shared_callee(); + ret void; +} + +define void @za_new_caller_za_preserved_callee() "aarch64_pstate_za_new" nounwind { +; SME-LABEL: za_new_caller_za_preserved_callee: +; SME: // %bb.0: // %prelude +; SME-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill +; SME-NEXT: mov x29, sp +; SME-NEXT: sub sp, sp, #16 +; SME-NEXT: mov x8, sp +; SME-NEXT: rdsvl x9, #1 +; SME-NEXT: msub x8, x9, x9, x8 +; SME-NEXT: mov sp, x8 +; SME-NEXT: stur wzr, [x29, #-4] +; SME-NEXT: sturh wzr, [x29, #-6] +; SME-NEXT: stur x8, [x29, #-16] +; SME-NEXT: mrs x8, TPIDR2_EL0 +; SME-NEXT: cbz x8, .LBB4_2 +; SME-NEXT: // %bb.1: // %save.za +; SME-NEXT: bl __arm_tpidr2_save +; SME-NEXT: msr TPIDR2_EL0, xzr +; SME-NEXT: .LBB4_2: +; SME-NEXT: smstart za +; SME-NEXT: bl za_preserved_callee +; SME-NEXT: smstop za +; SME-NEXT: mov sp, x29 +; SME-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; SME-NEXT: ret +; +; SME2-LABEL: za_new_caller_za_preserved_callee: +; SME2: // %bb.0: // %prelude +; SME2-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; SME2-NEXT: mov x29, sp +; SME2-NEXT: sub sp, sp, #16 +; SME2-NEXT: mov x8, sp +; SME2-NEXT: rdsvl x9, #1 +; SME2-NEXT: msub x8, x9, x9, x8 +; SME2-NEXT: mov sp, x8 +; SME2-NEXT: stur wzr, [x29, #-4] +; SME2-NEXT: sturh wzr, [x29, #-6] +; SME2-NEXT: stur x8, [x29, #-16] +; SME2-NEXT: mrs x8, TPIDR2_EL0 +; SME2-NEXT: cbz x8, .LBB4_2 +; SME2-NEXT: // %bb.1: // %save.za +; SME2-NEXT: bl __arm_tpidr2_save +; SME2-NEXT: msr TPIDR2_EL0, xzr +; SME2-NEXT: .LBB4_2: +; SME2-NEXT: smstart za +; SME2-NEXT: zero { zt0 } +; SME2-NEXT: bl za_preserved_callee +; SME2-NEXT: smstop za +; SME2-NEXT: mov sp, x29 +; SME2-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; SME2-NEXT: ret + call void @za_preserved_callee(); + ret void; +} + +define void @za_shared_caller_za_preserved_callee() "aarch64_pstate_za_shared" nounwind { +; SME-LABEL: za_shared_caller_za_preserved_callee: +; SME: // %bb.0: +; SME-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; SME-NEXT: mov x29, sp +; SME-NEXT: sub sp, sp, #16 +; SME-NEXT: mov x8, sp +; SME-NEXT: rdsvl x9, #1 +; SME-NEXT: msub x8, x9, x9, x8 +; SME-NEXT: mov sp, x8 +; SME-NEXT: stur wzr, [x29, #-4] +; SME-NEXT: sturh wzr, [x29, #-6] +; SME-NEXT: stur x8, [x29, #-16] +; SME-NEXT: bl za_preserved_callee +; SME-NEXT: mov sp, x29 +; SME-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; SME-NEXT: ret +; +; SME2-LABEL: za_shared_caller_za_preserved_callee: +; SME2: // %bb.0: +; SME2-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; SME2-NEXT: mov x29, sp +; SME2-NEXT: sub sp, sp, #16 +; SME2-NEXT: mov x8, sp +; SME2-NEXT: rdsvl x9, #1 +; SME2-NEXT: msub x8, x9, x9, x8 +; SME2-NEXT: mov sp, x8 +; SME2-NEXT: stur wzr, [x29, #-4] +; SME2-NEXT: sturh wzr, [x29, #-6] +; SME2-NEXT: stur x8, [x29, #-16] +; SME2-NEXT: bl za_preserved_callee +; SME2-NEXT: mov sp, x29 +; SME2-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; SME2-NEXT: ret + call void @za_preserved_callee(); + ret void; +} + +define void @za_preserved_caller_za_callee() "aarch64_pstate_za_preserved" nounwind { +; SME-LABEL: za_preserved_caller_za_callee: +; SME: // %bb.0: +; SME-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; SME-NEXT: bl normal_callee +; SME-NEXT: bl za_new_callee +; SME-NEXT: bl za_shared_callee +; SME-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; SME-NEXT: ret +; +; SME2-LABEL: za_preserved_caller_za_callee: +; SME2: // %bb.0: +; SME2-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill +; SME2-NEXT: bl normal_callee +; SME2-NEXT: bl za_new_callee +; SME2-NEXT: bl za_shared_callee +; SME2-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; SME2-NEXT: ret + call void @normal_callee(); + call void @za_new_callee(); + call void @za_shared_callee(); + ret void; +} + +define void @za_preserved_caller_za_preserved_callee() "aarch64_pstate_za_preserved" nounwind { +; SME-LABEL: za_preserved_caller_za_preserved_callee: +; SME: // %bb.0: +; SME-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; SME-NEXT: bl za_preserved_callee +; SME-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; SME-NEXT: ret +; +; SME2-LABEL: za_preserved_caller_za_preserved_callee: +; SME2: // %bb.0: +; SME2-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; SME2-NEXT: bl za_preserved_callee +; SME2-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; SME2-NEXT: ret + call void @za_preserved_callee(); + ret void; +} diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-extract-mova.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-extract-mova.ll --- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-extract-mova.ll +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-extract-mova.ll @@ -90,9 +90,12 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: mov w12, w0 ; CHECK-NEXT: mov { z0.d, z1.d }, za0h.d[w12, 0:1] +; CHECK-NEXT: mov { z0.d, z1.d }, za7h.d[w12, 0:1] ; CHECK-NEXT: ret %res = call { , } @llvm.aarch64.sme.read.hor.vg2.nxv2i64(i32 0, i32 %slice) - ret { , } %res + %slice.0 = add i32 %slice, 0 + %res2 = call { , } @llvm.aarch64.sme.read.hor.vg2.nxv2i64(i32 7, i32 %slice.0) + ret { , } %res2 } define { , } @za_read_horiz_vg2_f64(i32 %slice) { @@ -100,9 +103,12 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: mov w12, w0 ; CHECK-NEXT: mov { z0.d, z1.d }, za0h.d[w12, 0:1] +; CHECK-NEXT: mov { z0.d, z1.d }, za7h.d[w12, 0:1] ; CHECK-NEXT: ret %res = call { , } @llvm.aarch64.sme.read.hor.vg2.nxv2f64(i32 0, i32 %slice) - ret { , } %res + %slice.0 = add i32 %slice, 0 + %res2 = call { , } @llvm.aarch64.sme.read.hor.vg2.nxv2f64(i32 7, i32 %slice.0) + ret { , } %res2 } ; Vertical @@ -190,9 +196,12 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: mov w12, w0 ; CHECK-NEXT: mov { z0.d, z1.d }, za0v.d[w12, 0:1] +; CHECK-NEXT: mov { z0.d, z1.d }, za7v.d[w12, 0:1] ; CHECK-NEXT: ret %res = call { , } @llvm.aarch64.sme.read.ver.vg2.nxv2i64(i32 0, i32 %slice) - ret { , } %res + %slice.0 = add i32 %slice, 0 + %res2 = call { , } @llvm.aarch64.sme.read.ver.vg2.nxv2i64(i32 7, i32 %slice.0) + ret { , } %res2 } define { , } @za_read_vert_vg2_f64(i32 %slice) { @@ -200,9 +209,12 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: mov w12, w0 ; CHECK-NEXT: mov { z0.d, z1.d }, za0v.d[w12, 0:1] +; CHECK-NEXT: mov { z0.d, z1.d }, za7v.d[w12, 0:1] ; CHECK-NEXT: ret %res = call { , } @llvm.aarch64.sme.read.ver.vg2.nxv2f64(i32 0, i32 %slice) - ret { , } %res + %slice.0 = add i32 %slice, 0 + %res2 = call { , } @llvm.aarch64.sme.read.ver.vg2.nxv2f64(i32 7, i32 %slice.0) + ret { , } %res2 } ; @@ -268,9 +280,12 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: mov w12, w0 ; CHECK-NEXT: mov { z0.s - z3.s }, za0h.s[w12, 0:3] +; CHECK-NEXT: mov { z0.s - z3.s }, za3h.s[w12, 0:3] ; CHECK-NEXT: ret %res = call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv4i32(i32 0, i32 %slice) - ret { , , , } %res + %slice.0 = add i32 %slice, 0 + %res2 = call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv4i32(i32 3, i32 %slice.0) + ret { , , , } %res2 } define { , , , } @za_read_horiz_vg4_f32(i32 %slice) { @@ -278,9 +293,12 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: mov w12, w0 ; CHECK-NEXT: mov { z0.s - z3.s }, za0h.s[w12, 
0:3] +; CHECK-NEXT: mov { z0.s - z3.s }, za3h.s[w12, 0:3] ; CHECK-NEXT: ret %res = call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv4f32(i32 0, i32 %slice) - ret { , , , } %res + %slice.0 = add i32 %slice, 0 + %res2 = call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv4f32(i32 3, i32 %slice.0) + ret { , , , } %res2 } define { , , , } @za_read_horiz_vg4_d(i32 %slice) { @@ -288,9 +306,12 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: mov w12, w0 ; CHECK-NEXT: mov { z0.d - z3.d }, za0h.d[w12, 0:3] +; CHECK-NEXT: mov { z0.d - z3.d }, za7h.d[w12, 0:3] ; CHECK-NEXT: ret %res = call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv2i64(i32 0, i32 %slice) - ret { , , , } %res + %slice.0 = add i32 %slice, 0 + %res2 = call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv2i64(i32 7, i32 %slice.0) + ret { , , , } %res2 } define { , , , } @za_read_horiz_vg4_f64(i32 %slice) { @@ -298,9 +319,12 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: mov w12, w0 ; CHECK-NEXT: mov { z0.d - z3.d }, za0h.d[w12, 0:3] +; CHECK-NEXT: mov { z0.d - z3.d }, za7h.d[w12, 0:3] ; CHECK-NEXT: ret %res = call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv2f64(i32 0, i32 %slice) - ret { , , , } %res + %slice.0 = add i32 %slice, 0 + %res2 = call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv2f64(i32 7, i32 %slice.0) + ret { , , , } %res2 } ; Vertical @@ -362,9 +386,12 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: mov w12, w0 ; CHECK-NEXT: mov { z0.s - z3.s }, za0v.s[w12, 0:3] +; CHECK-NEXT: mov { z0.s - z3.s }, za3v.s[w12, 0:3] ; CHECK-NEXT: ret %res = call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv4i32(i32 0, i32 %slice) - ret { , , , } %res + %slice.0 = add i32 %slice, 0 + %res2 = call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv4i32(i32 3, i32 %slice.0) + ret { , , , } %res2 } define { , , , } @za_read_vert_vg4_f32(i32 %slice) { @@ -372,9 +399,12 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: mov w12, w0 ; CHECK-NEXT: mov { z0.s - z3.s }, za0v.s[w12, 0:3] +; CHECK-NEXT: mov { z0.s - z3.s }, za3v.s[w12, 0:3] ; CHECK-NEXT: ret %res = call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv4f32(i32 0, i32 %slice) - ret { , , , } %res + %slice.0 = add i32 %slice, 0 + %res2 = call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv4f32(i32 3, i32 %slice.0) + ret { , , , } %res2 } define { , , , } @za_read_vert_vg4_d(i32 %slice) { @@ -382,9 +412,12 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: mov w12, w0 ; CHECK-NEXT: mov { z0.d - z3.d }, za0v.d[w12, 0:3] +; CHECK-NEXT: mov { z0.d - z3.d }, za7v.d[w12, 0:3] ; CHECK-NEXT: ret %res = call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv2i64(i32 0, i32 %slice) - ret { , , , } %res + %slice.0 = add i32 %slice, 0 + %res2 = call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv2i64(i32 7, i32 %slice.0) + ret { , , , } %res2 } define { , , , } @za_read_vert_vg4_f64(i32 %slice) { @@ -392,9 +425,12 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: mov w12, w0 ; CHECK-NEXT: mov { z0.d - z3.d }, za0v.d[w12, 0:3] +; CHECK-NEXT: mov { z0.d - z3.d }, za7v.d[w12, 0:3] ; CHECK-NEXT: ret %res = call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv2f64(i32 0, i32 %slice) - ret { , , , } %res + %slice.0 = add i32 %slice, 0 + %res2 = call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv2f64(i32 7, i32 %slice.0) + ret { , , , } %res2 } ; Move Multi-Vector From ZA (Read) x2 diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-luti2-lane-x2.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-luti2-lane-x2.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-luti2-lane-x2.ll @@ -0,0 +1,35 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: 
--version 2 +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 < %s | FileCheck %s + +; lookup table expand one register + +define {, } @luti2_i8( %x) { +; CHECK-LABEL: luti2_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: luti2 { z0.b, z1.b }, zt0, z0[0] +; CHECK-NEXT: ret + %res = call {, } @llvm.aarch64.sme.luti2.lane.zt.x2.nxv16i8(i32 0, %x, i32 0) + ret {, } %res +} + +define {, } @luti2_i16( %x) { +; CHECK-LABEL: luti2_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: luti2 { z0.h, z1.h }, zt0, z0[7] +; CHECK-NEXT: ret + %res = call {, } @llvm.aarch64.sme.luti2.lane.zt.x2.nxv8i16(i32 0, %x, i32 7) + ret {, } %res +} + +define {, } @luti2_i32( %x) { +; CHECK-LABEL: luti2_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: luti2 { z0.s, z1.s }, zt0, z0[7] +; CHECK-NEXT: ret + %res = call {, } @llvm.aarch64.sme.luti2.lane.zt.x2.nxv4i32(i32 0, %x, i32 7) + ret {, }%res +} + +declare {, } @llvm.aarch64.sme.luti2.lane.zt.x2.nxv16i8(i32, , i32) +declare {, } @llvm.aarch64.sme.luti2.lane.zt.x2.nxv8i16(i32, , i32) +declare {, } @llvm.aarch64.sme.luti2.lane.zt.x2.nxv4i32(i32, , i32) diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-luti2-lane-x4.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-luti2-lane-x4.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-luti2-lane-x4.ll @@ -0,0 +1,35 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 < %s | FileCheck %s + +; lookup table expand one register + +define {, , , } @luti2_i8( %x) { +; CHECK-LABEL: luti2_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: luti2 { z0.b - z3.b }, zt0, z0[0] +; CHECK-NEXT: ret + %res = call {, , , } @llvm.aarch64.sme.luti2.lane.zt.x4.nxv16i8(i32 0, %x, i32 0) + ret {, , , } %res +} + +define {, , , } @luti2_i16( %x) { +; CHECK-LABEL: luti2_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: luti2 { z0.h - z3.h }, zt0, z0[3] +; CHECK-NEXT: ret + %res = call {, , , } @llvm.aarch64.sme.luti2.lane.zt.x4.nxv8i16(i32 0, %x, i32 3) + ret {, , , } %res +} + +define {, , , } @luti2_i32( %x) { +; CHECK-LABEL: luti2_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: luti2 { z0.s - z3.s }, zt0, z0[3] +; CHECK-NEXT: ret + %res = call {, , , } @llvm.aarch64.sme.luti2.lane.zt.x4.nxv4i32(i32 0, %x, i32 3) + ret {, , , }%res +} + +declare {, , , } @llvm.aarch64.sme.luti2.lane.zt.x4.nxv16i8(i32, , i32) +declare {, , , } @llvm.aarch64.sme.luti2.lane.zt.x4.nxv8i16(i32, , i32) +declare {, , , } @llvm.aarch64.sme.luti2.lane.zt.x4.nxv4i32(i32, , i32) diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-luti2-lane.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-luti2-lane.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-luti2-lane.ll @@ -0,0 +1,35 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 < %s | FileCheck %s + +; lookup table expand one register + +define @luti2_i8( %x) { +; CHECK-LABEL: luti2_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: luti2 z0.b, zt0, z0[0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sme.luti2.lane.zt.nxv16i8(i32 0, %x, i32 0) + ret %res +} + +define @luti2_i16( %x) { +; CHECK-LABEL: luti2_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: luti2 z0.h, zt0, z0[15] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sme.luti2.lane.zt.nxv8i16(i32 0, %x, i32 15) + ret %res +} + +define @luti2_i32( %x) { +; CHECK-LABEL: luti2_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: luti2 z0.s, zt0, z0[15] +; CHECK-NEXT: ret + %res = 
call @llvm.aarch64.sme.luti2.lane.zt.nxv4i32(i32 0, %x, i32 15) + ret %res +} + +declare @llvm.aarch64.sme.luti2.lane.zt.nxv16i8(i32, , i32) +declare @llvm.aarch64.sme.luti2.lane.zt.nxv8i16(i32, , i32) +declare @llvm.aarch64.sme.luti2.lane.zt.nxv4i32(i32, , i32) diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-luti4-lane-x2.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-luti4-lane-x2.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-luti4-lane-x2.ll @@ -0,0 +1,35 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 < %s | FileCheck %s + +; lookup table expand one register + +define {, } @luti4_i8( %x) { +; CHECK-LABEL: luti4_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: luti4 { z0.b, z1.b }, zt0, z0[0] +; CHECK-NEXT: ret + %res = call {, } @llvm.aarch64.sme.luti4.lane.zt.x2.nxv16i8(i32 0, %x, i32 0) + ret {, } %res +} + +define {, } @luti4_i16( %x) { +; CHECK-LABEL: luti4_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: luti4 { z0.h, z1.h }, zt0, z0[3] +; CHECK-NEXT: ret + %res = call {, } @llvm.aarch64.sme.luti4.lane.zt.x2.nxv8i16(i32 0, %x, i32 3) + ret {, } %res +} + +define {, } @luti4_i32( %x) { +; CHECK-LABEL: luti4_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: luti4 { z0.s, z1.s }, zt0, z0[3] +; CHECK-NEXT: ret + %res = call {, } @llvm.aarch64.sme.luti4.lane.zt.x2.nxv4i32(i32 0, %x, i32 3) + ret {, } %res +} + +declare {, } @llvm.aarch64.sme.luti4.lane.zt.x2.nxv16i8(i32, , i32) +declare {, } @llvm.aarch64.sme.luti4.lane.zt.x2.nxv8i16(i32, , i32) +declare {, } @llvm.aarch64.sme.luti4.lane.zt.x2.nxv4i32(i32, , i32) diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-luti4-lane-x4.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-luti4-lane-x4.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-luti4-lane-x4.ll @@ -0,0 +1,25 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 < %s | FileCheck %s + +; lookup table expand one register + +define {, , , } @luti4_i16( %x) { +; CHECK-LABEL: luti4_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: luti4 { z0.h - z3.h }, zt0, z0[0] +; CHECK-NEXT: ret + %res = call {, , , } @llvm.aarch64.sme.luti4.lane.zt.x4.nxv8i16(i32 0, %x, i32 0) + ret {, , , } %res +} + +define {, , , } @luti4_i32( %x) { +; CHECK-LABEL: luti4_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: luti4 { z0.s - z3.s }, zt0, z0[1] +; CHECK-NEXT: ret + %res = call {, , , } @llvm.aarch64.sme.luti4.lane.zt.x4.nxv4i32(i32 0, %x, i32 1) + ret {, , , } %res +} + +declare {, , , } @llvm.aarch64.sme.luti4.lane.zt.x4.nxv8i16(i32, , i32) +declare {, , , } @llvm.aarch64.sme.luti4.lane.zt.x4.nxv4i32(i32, , i32) diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-luti4-lane.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-luti4-lane.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-luti4-lane.ll @@ -0,0 +1,35 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 < %s | FileCheck %s + +; lookup table expand one register + +define @luti4_i8( %x) { +; CHECK-LABEL: luti4_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: luti4 z0.b, zt0, z0[0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sme.luti4.lane.zt.nxv16i8(i32 0, %x, i32 0) + ret %res +} + +define @luti4_i16( %x) { +; CHECK-LABEL: luti4_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: 
luti4 z0.h, zt0, z0[7] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sme.luti4.lane.zt.nxv8i16(i32 0, %x, i32 7) + ret %res +} + +define @luti4_i32( %x) { +; CHECK-LABEL: luti4_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: luti4 z0.s, zt0, z0[7] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sme.luti4.lane.zt.nxv4i32(i32 0, %x, i32 7) + ret %res +} + +declare @llvm.aarch64.sme.luti4.lane.zt.nxv16i8(i32, , i32) +declare @llvm.aarch64.sme.luti4.lane.zt.nxv8i16(i32, , i32) +declare @llvm.aarch64.sme.luti4.lane.zt.nxv4i32(i32, , i32) diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-zero-zt.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-zero-zt.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-zero-zt.ll @@ -0,0 +1,13 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 < %s | FileCheck %s + +define void @zero_zt0() { +; CHECK-LABEL: zero_zt0: +; CHECK: // %bb.0: +; CHECK-NEXT: zero { zt0 } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.zero.zt(i32 0) + ret void +} + +declare void @llvm.aarch64.sme.zero.zt(i32) diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-zt0.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-zt0.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-zt0.ll @@ -0,0 +1,27 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -verify-machineinstrs < %s | FileCheck %s + +; LDR + +define void @ldr_zt0(ptr %ptr) { +; CHECK-LABEL: ldr_zt0: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr zt0, [x0] +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.ldr.zt(i32 0, ptr %ptr) + ret void; +} + +; STR + +define void @str_zt0(ptr %ptr) { +; CHECK-LABEL: str_zt0: +; CHECK: // %bb.0: +; CHECK-NEXT: str zt0, [x0] +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.str.zt(i32 0, ptr %ptr) + ret void; +} + +declare void @llvm.aarch64.sme.ldr.zt(i32, ptr) +declare void @llvm.aarch64.sme.str.zt(i32, ptr) diff --git a/llvm/test/CodeGen/AArch64/sme2p1-intrinsics-max.ll b/llvm/test/CodeGen/AArch64/sme2p1-intrinsics-max.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme2p1-intrinsics-max.ll @@ -0,0 +1,78 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2p1 -mattr=+b16b16 -verify-machineinstrs < %s | FileCheck %s + +; BFMAX (Single, x2) + +define { , } @multi_vec_max_single_x2_bf16( %zdn1, %zdn2, %zm) { +; CHECK-LABEL: multi_vec_max_single_x2_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: bfmax { z0.h, z1.h }, { z0.h, z1.h }, z2.h +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.bfmax.single.x2.nxv8bf16( %zdn1, %zdn2, %zm) + ret { , } %res +} + +; BFMAX (Single, x4) + +define { , , , } +@multi_vec_max_single_x4_bf16( %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +; CHECK-LABEL: multi_vec_max_single_x4_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: bfmax { z0.h - z3.h }, { z0.h - z3.h }, z4.h +; CHECK-NEXT: ret + %res = call { , , , } + @llvm.aarch64.sve.bfmax.single.x4.nxv8bf16( %zdn1, %zdn2, %zdn3, 
%zdn4, %zm) + ret { , , , } %res +} + +; BFMAX (Multi, x2) + +define { , } @multi_vec_max_multi_x2_bf16( %zdn1, %zdn2, %zm1, %zm2) { +; CHECK-LABEL: multi_vec_max_multi_x2_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z2_z3 def $z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z2_z3 def $z2_z3 +; CHECK-NEXT: bfmax { z0.h, z1.h }, { z0.h, z1.h }, { z2.h, z3.h } +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.bfmax.x2.nxv8bf16( %zdn1, %zdn2, %zm1, %zm2) + ret { , } %res +} + +; BFMAX (Multi, x4) + +define { , , , } +@multi_vec_max_multi_x4_bf16( %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { +; CHECK-LABEL: multi_vec_max_multi_x4_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z7 killed $z7 killed $z4_z5_z6_z7 def $z4_z5_z6_z7 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z6 killed $z6 killed $z4_z5_z6_z7 def $z4_z5_z6_z7 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z5 killed $z5 killed $z4_z5_z6_z7 def $z4_z5_z6_z7 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z4_z5_z6_z7 def $z4_z5_z6_z7 +; CHECK-NEXT: bfmax { z0.h - z3.h }, { z0.h - z3.h }, { z4.h - z7.h } +; CHECK-NEXT: ret + %res = call { , , , } + @llvm.aarch64.sve.bfmax.x4.nxv8bf16( %zdn1, %zdn2, %zdn3, %zdn4, + %zm1, %zm2, %zm3, %zm4) + ret { , , , } %res +} + +declare { , } @llvm.aarch64.sve.bfmax.single.x2.nxv8bf16(, , ) + +declare { , , , } + @llvm.aarch64.sve.bfmax.single.x4.nxv8bf16(, , , , ) + +declare { , } @llvm.aarch64.sve.bfmax.x2.nxv8bf16(, , , ) + +declare { , , , } + @llvm.aarch64.sve.bfmax.x4.nxv8bf16(, , , , , , , ) diff --git a/llvm/test/CodeGen/AArch64/sme2p1-intrinsics-min.ll b/llvm/test/CodeGen/AArch64/sme2p1-intrinsics-min.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme2p1-intrinsics-min.ll @@ -0,0 +1,79 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2p1 -mattr=+b16b16 -verify-machineinstrs < %s | FileCheck %s + +; BFMIN (Single, x2) + +define { , } @multi_vec_min_single_x2_bf16( %zdn1, %zdn2, %zm) { +; CHECK-LABEL: multi_vec_min_single_x2_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: bfmin { z0.h, z1.h }, { z0.h, z1.h }, z2.h +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.bfmin.single.x2.nxv8bf16( %zdn1, %zdn2, %zm) + ret { , } %res +} + +; BFMIN (Single, x4) + +define { , , , } +@multi_vec_min_single_x4_bf16( %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +; CHECK-LABEL: multi_vec_min_single_x4_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: bfmin { z0.h - z3.h }, { z0.h - z3.h }, z4.h +; CHECK-NEXT: ret + %res = call { , , , } + @llvm.aarch64.sve.bfmin.single.x4.nxv8bf16( %zdn1, %zdn2, 
%zdn3, %zdn4, %zm) + ret { , , , } %res +} + +; BFMIN (Multi, x2) + +define { , } @multi_vec_min_multi_x2_bf16( %zdn1, %zdn2, %zm1, %zm2) { +; CHECK-LABEL: multi_vec_min_multi_x2_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z2_z3 def $z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z2_z3 def $z2_z3 +; CHECK-NEXT: bfmin { z0.h, z1.h }, { z0.h, z1.h }, { z2.h, z3.h } +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.bfmin.x2.nxv8bf16( %zdn1, %zdn2, %zm1, %zm2) + ret { , } %res +} + +; BFMIN (Multi, x4) + +define { , , , } +@multi_vec_min_multi_x4_bf16( %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { +; CHECK-LABEL: multi_vec_min_multi_x4_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z7 killed $z7 killed $z4_z5_z6_z7 def $z4_z5_z6_z7 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z6 killed $z6 killed $z4_z5_z6_z7 def $z4_z5_z6_z7 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z5 killed $z5 killed $z4_z5_z6_z7 def $z4_z5_z6_z7 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z4_z5_z6_z7 def $z4_z5_z6_z7 +; CHECK-NEXT: bfmin { z0.h - z3.h }, { z0.h - z3.h }, { z4.h - z7.h } +; CHECK-NEXT: ret + %res = call { , , , } + @llvm.aarch64.sve.bfmin.x4.nxv8bf16( %zdn1, %zdn2, %zdn3, %zdn4, + %zm1, %zm2, %zm3, %zm4) + ret { , , , } %res +} + +declare { , } @llvm.aarch64.sve.bfmin.single.x2.nxv8bf16(, , ) + +declare { , , , } + @llvm.aarch64.sve.bfmin.single.x4.nxv8bf16(, , , , ) + +declare { , } @llvm.aarch64.sve.bfmin.x2.nxv8bf16(, , , ) + +declare { , , , } + @llvm.aarch64.sve.bfmin.x4.nxv8bf16(, , , , , , , ) diff --git a/llvm/test/CodeGen/AArch64/streaming-mode-no-reorder.ll b/llvm/test/CodeGen/AArch64/streaming-mode-no-reorder.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/streaming-mode-no-reorder.ll @@ -0,0 +1,20 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -verify-machineinstrs < %s | FileCheck %s + +; Test SMSTART and SMSTOP are marked as scheduling barriers and are not +; rescheduled around function calls. 
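+; Rationale: @clear_arr is itself a normal (non-streaming) function, so the call
+; to the streaming callee is bracketed by "smstart sm" / "smstop sm". If those
+; instructions were not modelled as scheduling barriers, the stores produced for
+; the memset could be scheduled into that window and would then execute with
+; PSTATE.SM=1, after the mode switch has reset the FP/SIMD/SVE register state
+; and where some Neon instructions are not legal.
+;
+; Roughly corresponds to the following C, assuming the arm_streaming type
+; attribute from this patch (illustrative only, not part of the test input):
+;   extern void streaming_callee(void) __attribute__((arm_streaming));
+;   char arr[8];
+;   void clear_arr(void) { streaming_callee(); memset(arr, 0, sizeof(arr)); }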
+ +declare void @streaming_callee() "aarch64_pstate_sm_enabled"; + +@arr = global [8 x i8] zeroinitializer, align 2 + +define void @clear_arr() { +; CHECK-LABEL: clear_arr: +; CHECK: smstart sm +; CHECK-NEXT: bl streaming_callee +; CHECK-NEXT: smstop sm + call void @streaming_callee() + call void @llvm.memset.p0i8.i64(i8* getelementptr inbounds ([8 x i8], [8 x i8]* @arr, i64 0, i64 0), i8 0, i64 8, i1 false) + ret void; +} + +declare void @llvm.memset.p0i8.i64(i8*, i8, i64, i1) diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-reinterpret.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-reinterpret.ll --- a/llvm/test/CodeGen/AArch64/sve-intrinsics-reinterpret.ll +++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-reinterpret.ll @@ -55,6 +55,14 @@ ret %res } +define @reinterpret_bool_from_svcount(target("aarch64.svcount") %pg) "target-features"="+sme2" { +; CHECK-LABEL: reinterpret_bool_from_svcount: +; CHECK: // %bb.0: +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.convert.to.svbool.taarch64.svcountt(target("aarch64.svcount") %pg) + ret %out +} + ; ; Converting from svbool_t ; @@ -99,6 +107,15 @@ ret %out } +define target("aarch64.svcount") @reinterpret_bool_to_svcount( %pg) "target-features"="+sme2" { +; CHECK-LABEL: reinterpret_bool_to_svcount: +; CHECK: // %bb.0: +; CHECK-NEXT: ret + %out = call target("aarch64.svcount") @llvm.aarch64.sve.convert.from.svbool.taarch64.svcountt( %pg) + ret target("aarch64.svcount") %out +} + + ; Reinterpreting a ptrue should not introduce an `and` instruction. define @reinterpret_ptrue() { ; CHECK-LABEL: reinterpret_ptrue: @@ -142,9 +159,11 @@ declare @llvm.aarch64.sve.convert.to.svbool.nxv4i1() declare @llvm.aarch64.sve.convert.to.svbool.nxv2i1() declare @llvm.aarch64.sve.convert.to.svbool.nxv1i1() +declare @llvm.aarch64.sve.convert.to.svbool.taarch64.svcountt(target("aarch64.svcount")) declare @llvm.aarch64.sve.convert.from.svbool.nxv16i1() declare @llvm.aarch64.sve.convert.from.svbool.nxv8i1() declare @llvm.aarch64.sve.convert.from.svbool.nxv4i1() declare @llvm.aarch64.sve.convert.from.svbool.nxv2i1() declare @llvm.aarch64.sve.convert.from.svbool.nxv1i1() +declare target("aarch64.svcount") @llvm.aarch64.sve.convert.from.svbool.taarch64.svcountt() diff --git a/llvm/test/CodeGen/AArch64/sve2-intrinsics-psel.ll b/llvm/test/CodeGen/AArch64/sve2-intrinsics-psel.ll --- a/llvm/test/CodeGen/AArch64/sve2-intrinsics-psel.ll +++ b/llvm/test/CodeGen/AArch64/sve2-intrinsics-psel.ll @@ -22,70 +22,70 @@ ret %res } -define @psel_h( %p1, %p2, i32 %idx) { +define @psel_h( %p1, %p2, i32 %idx) { ; CHECK-LABEL: psel_h: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w12, w0 ; CHECK-NEXT: psel p0, p0, p1.h[w12, 0] ; CHECK-NEXT: ret - %res = call @llvm.aarch64.sve.psel.nxv8i1( %p1, %p2, i32 %idx) - ret %res + %res = call @llvm.aarch64.sve.psel.nxv8i1( %p1, %p2, i32 %idx) + ret %res } -define @psel_h_imm( %p1, %p2, i32 %idx) { +define @psel_h_imm( %p1, %p2, i32 %idx) { ; CHECK-LABEL: psel_h_imm: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w12, w0 ; CHECK-NEXT: psel p0, p0, p1.h[w12, 7] ; CHECK-NEXT: ret %add = add i32 %idx, 7 - %res = call @llvm.aarch64.sve.psel.nxv8i1( %p1, %p2, i32 %add) - ret %res + %res = call @llvm.aarch64.sve.psel.nxv8i1( %p1, %p2, i32 %add) + ret %res } -define @psel_s( %p1, %p2, i32 %idx) { +define @psel_s( %p1, %p2, i32 %idx) { ; CHECK-LABEL: psel_s: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w12, w0 ; CHECK-NEXT: psel p0, p0, p1.s[w12, 0] ; CHECK-NEXT: ret - %res = call @llvm.aarch64.sve.psel.nxv4i1( %p1, %p2, i32 %idx) - ret %res + %res = call 
@llvm.aarch64.sve.psel.nxv4i1( %p1, %p2, i32 %idx) + ret %res } -define @psel_s_imm( %p1, %p2, i32 %idx) { +define @psel_s_imm( %p1, %p2, i32 %idx) { ; CHECK-LABEL: psel_s_imm: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w12, w0 ; CHECK-NEXT: psel p0, p0, p1.s[w12, 3] ; CHECK-NEXT: ret %add = add i32 %idx, 3 - %res = call @llvm.aarch64.sve.psel.nxv4i1( %p1, %p2, i32 %add) - ret %res + %res = call @llvm.aarch64.sve.psel.nxv4i1( %p1, %p2, i32 %add) + ret %res } -define @psel_d( %p1, %p2, i32 %idx) { +define @psel_d( %p1, %p2, i32 %idx) { ; CHECK-LABEL: psel_d: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w12, w0 ; CHECK-NEXT: psel p0, p0, p1.d[w12, 0] ; CHECK-NEXT: ret - %res = call @llvm.aarch64.sve.psel.nxv2i1( %p1, %p2, i32 %idx) - ret %res + %res = call @llvm.aarch64.sve.psel.nxv2i1( %p1, %p2, i32 %idx) + ret %res } -define @psel_d_imm( %p1, %p2, i32 %idx) { +define @psel_d_imm( %p1, %p2, i32 %idx) { ; CHECK-LABEL: psel_d_imm: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w12, w0 ; CHECK-NEXT: psel p0, p0, p1.d[w12, 1] ; CHECK-NEXT: ret %add = add i32 %idx, 1 - %res = call @llvm.aarch64.sve.psel.nxv2i1( %p1, %p2, i32 %add) - ret %res + %res = call @llvm.aarch64.sve.psel.nxv2i1( %p1, %p2, i32 %add) + ret %res } declare @llvm.aarch64.sve.psel.nxv16i1(, , i32) -declare @llvm.aarch64.sve.psel.nxv8i1(, , i32) -declare @llvm.aarch64.sve.psel.nxv4i1(, , i32) -declare @llvm.aarch64.sve.psel.nxv2i1(, , i32) +declare @llvm.aarch64.sve.psel.nxv8i1(, , i32) +declare @llvm.aarch64.sve.psel.nxv4i1(, , i32) +declare @llvm.aarch64.sve.psel.nxv2i1(, , i32) diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfadd.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfadd.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfadd.ll @@ -0,0 +1,13 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -mattr=+b16b16 -verify-machineinstrs < %s | FileCheck %s + +define @bfadd( %pg, %a, %b){ +; CHECK-LABEL: bfadd: +; CHECK: // %bb.0: +; CHECK-NEXT: bfadd z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.fadd.nxv8bf16( %pg, %a, %b) + ret %res +} + +declare @llvm.aarch64.sve.fadd.nxv8bf16(, , ) diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmax.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmax.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmax.ll @@ -0,0 +1,13 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -mattr=+b16b16 -verify-machineinstrs < %s | FileCheck %s + +define @bfmax( %pg, %a, %b){ +; CHECK-LABEL: bfmax: +; CHECK: // %bb.0: +; CHECK-NEXT: bfmax z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.fmax.nxv8bf16( %pg, %a, %b) + ret %res +} + +declare @llvm.aarch64.sve.fmax.nxv8bf16(, , ) diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmaxnm.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmaxnm.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmaxnm.ll @@ -0,0 +1,13 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -mattr=+b16b16 -verify-machineinstrs < %s | FileCheck %s + +define @bfmaxnm( %pg, %a, %b){ +; CHECK-LABEL: bfmaxnm: +; CHECK: // %bb.0: +; CHECK-NEXT: bfmaxnm z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.fmaxnm.nxv8bf16( %pg, %a, %b) + ret %res 
+} + +declare @llvm.aarch64.sve.fmaxnm.nxv8bf16(, , ) diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmin.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmin.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmin.ll @@ -0,0 +1,13 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -mattr=+b16b16 -verify-machineinstrs < %s | FileCheck %s + +define @bfmin( %pg, %a, %b){ +; CHECK-LABEL: bfmin: +; CHECK: // %bb.0: +; CHECK-NEXT: bfmin z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.fmin.nxv8bf16( %pg, %a, %b) + ret %res +} + +declare @llvm.aarch64.sve.fmin.nxv8bf16(, , ) diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfminnm.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfminnm.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfminnm.ll @@ -0,0 +1,13 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -mattr=+b16b16 -verify-machineinstrs < %s | FileCheck %s + +define @bfminnm( %pg, %a, %b){ +; CHECK-LABEL: bfminnm: +; CHECK: // %bb.0: +; CHECK-NEXT: bfminnm z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.fminnm.nxv8bf16( %pg, %a, %b) + ret %res +} + +declare @llvm.aarch64.sve.fminnm.nxv8bf16(, , ) diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmls.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmls.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmls.ll @@ -0,0 +1,43 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -mattr=+b16b16 -verify-machineinstrs < %s | FileCheck %s + +define @bfmlslb_f32( %zda, %zn, %zm) { +; CHECK-LABEL: bfmlslb_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: bfmlslb z0.s, z1.h, z2.h +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.bfmlslb( %zda, %zn, %zm) + ret %out +} + +define @bfmlslt_f32( %zda, %zn, %zm) { +; CHECK-LABEL: bfmlslt_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: bfmlslt z0.s, z1.h, z2.h +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.bfmlslt( %zda, %zn, %zm) + ret %out +} + +define @bfmlslb_lane_f32( %zda, %zn, %zm) { +; CHECK-LABEL: bfmlslb_lane_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: bfmlslb z0.s, z1.h, z2.h[7] +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.bfmlslb.lane( %zda, %zn, %zm, i32 7) + ret %out +} + +define @bfmlslt_lane_f32( %zda, %zn, %zm) { +; CHECK-LABEL: bfmlslt_lane_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: bfmlslt z0.s, z1.h, z2.h[7] +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.bfmlslt.lane( %zda, %zn, %zm, i32 7) + ret %out +} + +declare @llvm.aarch64.sve.bfmlslb(, , ) +declare @llvm.aarch64.sve.bfmlslt(, , ) +declare @llvm.aarch64.sve.bfmlslb.lane(, , , i32) +declare @llvm.aarch64.sve.bfmlslt.lane(, , , i32) diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmul.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmul.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmul.ll @@ -0,0 +1,13 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -mattr=+b16b16 -verify-machineinstrs < %s | FileCheck %s + +define @bfmul( %pg, %a, %b){ +; CHECK-LABEL: bfmul: +; CHECK: // %bb.0: +; CHECK-NEXT: bfmul z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: ret + %res = call 
@llvm.aarch64.sve.fmul.nxv8bf16( %pg, %a, %b) + ret %res +} + +declare @llvm.aarch64.sve.fmul.nxv8bf16(, , ) diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmul_lane.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmul_lane.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmul_lane.ll @@ -0,0 +1,37 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -mattr=+b16b16 -verify-machineinstrs < %s | FileCheck %s + +define @bfmul_lane_idx1( %a, %b) { +; CHECK-LABEL: bfmul_lane_idx1: +; CHECK: // %bb.0: +; CHECK-NEXT: bfmul z0.h, z0.h, z1.h[1] +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.fmul.lane.nxv8bf16( %a, + %b, + i32 1) + ret %out +} + +define @bfmul_lane_idx3( %a, %b) { +; CHECK-LABEL: bfmul_lane_idx3: +; CHECK: // %bb.0: +; CHECK-NEXT: bfmul z0.h, z0.h, z1.h[3] +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.fmul.lane.nxv8bf16( %a, + %b, + i32 3) + ret %out +} + +define @bfmul_lane_idx7( %a, %b) { +; CHECK-LABEL: bfmul_lane_idx7: +; CHECK: // %bb.0: +; CHECK-NEXT: bfmul z0.h, z0.h, z1.h[7] +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.fmul.lane.nxv8bf16( %a, + %b, + i32 7) + ret %out +} + +declare @llvm.aarch64.sve.fmul.lane.nxv8bf16(, , i32) diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfsub.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfsub.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfsub.ll @@ -0,0 +1,13 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -mattr=+b16b16 -verify-machineinstrs < %s | FileCheck %s + +define @bfsub( %pg, %a, %b){ +; CHECK-LABEL: bfsub: +; CHECK: // %bb.0: +; CHECK-NEXT: bfsub z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.fsub.nxv8bf16( %pg, %a, %b) + ret %res +} + +declare @llvm.aarch64.sve.fsub.nxv8bf16(, , ) diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-cntp.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-cntp.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-cntp.ll @@ -0,0 +1,81 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 < %s | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 < %s | FileCheck %s + +define i64 @test_svcntp_c8_vlx2(target("aarch64.svcount") %pn) nounwind { +; CHECK-LABEL: test_svcntp_c8_vlx2: +; CHECK: // %bb.0: +; CHECK-NEXT: cntp x0, pn0.b, vlx2 +; CHECK-NEXT: ret + %res = call i64 @llvm.aarch64.sve.cntp.c8(target("aarch64.svcount") %pn, i32 2) + ret i64 %res +} + +define i64 @test_svcntp_c8_vlx4(target("aarch64.svcount") %pn) nounwind { +; CHECK-LABEL: test_svcntp_c8_vlx4: +; CHECK: // %bb.0: +; CHECK-NEXT: cntp x0, pn0.b, vlx4 +; CHECK-NEXT: ret + %res = call i64 @llvm.aarch64.sve.cntp.c8(target("aarch64.svcount") %pn, i32 4) + ret i64 %res +} + +define i64 @test_svcntp_c16_vlx2(target("aarch64.svcount") %pn) nounwind { +; CHECK-LABEL: test_svcntp_c16_vlx2: +; CHECK: // %bb.0: +; CHECK-NEXT: cntp x0, pn0.h, vlx2 +; CHECK-NEXT: ret + %res = call i64 @llvm.aarch64.sve.cntp.c16(target("aarch64.svcount") %pn, i32 2) + ret i64 %res +} + +define i64 @test_svcntp_c16_vlx4(target("aarch64.svcount") %pn) nounwind { +; CHECK-LABEL: test_svcntp_c16_vlx4: +; CHECK: // %bb.0: +; CHECK-NEXT: cntp x0, pn0.h, vlx4 +; CHECK-NEXT: ret + %res = call i64 
@llvm.aarch64.sve.cntp.c16(target("aarch64.svcount") %pn, i32 4) + ret i64 %res +} + +define i64 @test_svcntp_c32_vlx2(target("aarch64.svcount") %pn) nounwind { +; CHECK-LABEL: test_svcntp_c32_vlx2: +; CHECK: // %bb.0: +; CHECK-NEXT: cntp x0, pn0.s, vlx2 +; CHECK-NEXT: ret + %res = call i64 @llvm.aarch64.sve.cntp.c32(target("aarch64.svcount") %pn, i32 2) + ret i64 %res +} + +define i64 @test_svcntp_c32_vlx4(target("aarch64.svcount") %pn) nounwind { +; CHECK-LABEL: test_svcntp_c32_vlx4: +; CHECK: // %bb.0: +; CHECK-NEXT: cntp x0, pn0.s, vlx4 +; CHECK-NEXT: ret + %res = call i64 @llvm.aarch64.sve.cntp.c32(target("aarch64.svcount") %pn, i32 4) + ret i64 %res +} + +define i64 @test_svcntp_c64_vlx2(target("aarch64.svcount") %pn) nounwind { +; CHECK-LABEL: test_svcntp_c64_vlx2: +; CHECK: // %bb.0: +; CHECK-NEXT: cntp x0, pn0.d, vlx2 +; CHECK-NEXT: ret + %res = call i64 @llvm.aarch64.sve.cntp.c64(target("aarch64.svcount") %pn, i32 2) + ret i64 %res +} + +define i64 @test_svcntp_c64_vlx4(target("aarch64.svcount") %pn) nounwind { +; CHECK-LABEL: test_svcntp_c64_vlx4: +; CHECK: // %bb.0: +; CHECK-NEXT: cntp x0, pn0.d, vlx4 +; CHECK-NEXT: ret + %res = call i64 @llvm.aarch64.sve.cntp.c64(target("aarch64.svcount") %pn, i32 4) + ret i64 %res +} + + +declare i64 @llvm.aarch64.sve.cntp.c8(target("aarch64.svcount"), i32) +declare i64 @llvm.aarch64.sve.cntp.c16(target("aarch64.svcount"), i32) +declare i64 @llvm.aarch64.sve.cntp.c32(target("aarch64.svcount"), i32) +declare i64 @llvm.aarch64.sve.cntp.c64(target("aarch64.svcount"), i32) diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-fclamp.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-fclamp.ll --- a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-fclamp.ll +++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-fclamp.ll @@ -3,8 +3,8 @@ target triple = "aarch64-linux-gnu" -define @test_fclamp_f16( %a, %b, %c) #0 { -; CHECK-LABEL: test_fclamp_f16: +define @test_fclamp_i16( %a, %b, %c) #0 { +; CHECK-LABEL: test_fclamp_i16: ; CHECK: // %bb.0: ; CHECK-NEXT: fclamp z0.h, z1.h, z2.h ; CHECK-NEXT: ret @@ -12,8 +12,8 @@ ret %res } -define @test_fclamp_f32( %a, %b, %c) #0 { -; CHECK-LABEL: test_fclamp_f32: +define @test_fclamp_i32( %a, %b, %c) #0 { +; CHECK-LABEL: test_fclamp_i32: ; CHECK: // %bb.0: ; CHECK-NEXT: fclamp z0.s, z1.s, z2.s ; CHECK-NEXT: ret @@ -21,8 +21,8 @@ ret %res } -define @test_fclamp_f64( %a, %b, %c) #0 { -; CHECK-LABEL: test_fclamp_f64: +define @test_fclamp_i64( %a, %b, %c) #0 { +; CHECK-LABEL: test_fclamp_i64: ; CHECK: // %bb.0: ; CHECK-NEXT: fclamp z0.d, z1.d, z2.d ; CHECK-NEXT: ret @@ -30,8 +30,8 @@ ret %res } -define { , } @test_fclamp_single_x2_f16( %a, %b, %c, %d) #1 { -; CHECK-LABEL: test_fclamp_single_x2_f16: +define { , } @test_fclamp_single_x2_i16( %a, %b, %c, %d) #1 { +; CHECK-LABEL: test_fclamp_single_x2_i16: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 ; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 @@ -41,8 +41,8 @@ ret { , } %res } -define { , } @test_fclamp_single_x2_f32( %a, %b, %c, %d) #1 { -; CHECK-LABEL: test_fclamp_single_x2_f32: +define { , } @test_fclamp_single_x2_i32( %a, %b, %c, %d) #1 { +; CHECK-LABEL: test_fclamp_single_x2_i32: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 ; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 @@ -52,8 +52,8 @@ ret { , } %res } -define { , } @test_fclamp_single_x2_f64( %a, %b, %c, %d) #1 { -; CHECK-LABEL: test_fclamp_single_x2_f64: +define { , } @test_fclamp_single_x2_i64( 
%a, %b, %c, %d) #1 { +; CHECK-LABEL: test_fclamp_single_x2_i64: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 ; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 @@ -64,8 +64,8 @@ } -define { , , , } @test_fclamp_single_x4_f16( %a, %b, %c, %d, %e, %f) #1 { -; CHECK-LABEL: test_fclamp_single_x4_f16: +define { , , , } @test_fclamp_single_x4_i16( %a, %b, %c, %d, %e, %f) #1 { +; CHECK-LABEL: test_fclamp_single_x4_i16: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 ; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 @@ -77,8 +77,8 @@ ret { , , , } %res } -define { , , , } @test_fclamp_single_x4_f32( %a, %b, %c, %d, %e, %f) #1 { -; CHECK-LABEL: test_fclamp_single_x4_f32: +define { , , , } @test_fclamp_single_x4_i32( %a, %b, %c, %d, %e, %f) #1 { +; CHECK-LABEL: test_fclamp_single_x4_i32: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 ; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 @@ -90,8 +90,8 @@ ret { , , , } %res } -define { , , , } @test_fclamp_single_x4_f64( %a, %b, %c, %d, %e, %f) #1 { -; CHECK-LABEL: test_fclamp_single_x4_f64: +define { , , , } @test_fclamp_single_x4_i64( %a, %b, %c, %d, %e, %f) #1 { +; CHECK-LABEL: test_fclamp_single_x4_i64: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 ; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-loads.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-loads.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-loads.ll @@ -0,0 +1,647 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1,+bf16 < %s | FileCheck %s + +; == Normal Multi-Vector Consecutive Loads == + +define { , } @ld1_vg2_i8(target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ld1_vg2_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ld1b { z0.b, z1.b }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.ld1.pn.vg2.nxv16i8(target("aarch64.svcount") %pn, ptr %ptr); + ret { , } %res +} + +define { , } @ld1_vg2_i16(target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ld1_vg2_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ld1h { z0.h, z1.h }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.ld1.pn.vg2.nxv8i16(target("aarch64.svcount") %pn, ptr %ptr); + ret { , } %res +} + +define { , } @ld1_vg2_i32(target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ld1_vg2_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ld1w { z0.s, z1.s }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.ld1.pn.vg2.nxv4i32(target("aarch64.svcount") %pn, ptr %ptr); + ret { , } %res +} + +define { , } @ld1_vg2_i64(target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ld1_vg2_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ld1d { z0.d, z1.d }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.ld1.pn.vg2.nxv2i64(target("aarch64.svcount") %pn, ptr %ptr); + ret { , } %res +} + +define { , } @ld1_vg2_f16(target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ld1_vg2_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ld1h { z0.h, z1.h }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.ld1.pn.vg2.nxv8f16(target("aarch64.svcount") %pn, ptr %ptr); + ret { , } %res +} + +define { , } @ld1_vg2_bf16(target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ld1_vg2_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ld1h { z0.h, z1.h }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.ld1.pn.vg2.nxv8bf16(target("aarch64.svcount") %pn, ptr %ptr); + ret { , } %res +} + +define { , } @ld1_vg2_f32(target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ld1_vg2_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ld1w { z0.s, z1.s }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.ld1.pn.vg2.nxv4f32(target("aarch64.svcount") %pn, ptr %ptr); + ret { , } %res +} + +define { , } @ld1_vg2_f64(target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ld1_vg2_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ld1d { z0.d, z1.d }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.ld1.pn.vg2.nxv2f64(target("aarch64.svcount") %pn, ptr %ptr); + ret { , } %res +} + +; Test to ensure we load into the correct registers for the instruction +define @ld1_vg2_i8_z0_taken(target("aarch64.svcount") %pn, ptr %ptr, %val) { +; CHECK-LABEL: ld1_vg2_i8_z0_taken: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ld1b { z2.b, z3.b }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: add z0.b, z0.b, z2.b +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %ld1 = call { , } @llvm.aarch64.sve.ld1.pn.vg2.nxv16i8(target("aarch64.svcount") %pn, ptr %ptr); + %ld1_0 = extractvalue { , } %ld1, 0 + %res = add %val, %ld1_0 + ret %res +} + +define { , , , } @ld1_vg4_i8(target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ld1_vg4_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ld1b { z0.b - z3.b }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.ld1.pn.vg4.nxv16i8(target("aarch64.svcount") %pn, ptr %ptr); + ret { , , , } %res +} + +define { , , , } @ld1_vg4_i16(target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ld1_vg4_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ld1h { z0.h - z3.h }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.ld1.pn.vg4.nxv8i16(target("aarch64.svcount") %pn, ptr %ptr); + ret { , , , } %res +} + +define { , , , } @ld1_vg4_i32(target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ld1_vg4_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ld1w { z0.s - z3.s }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.ld1.pn.vg4.nxv4i32(target("aarch64.svcount") %pn, ptr %ptr); + ret { , , , } %res +} + +define { , , , } @ld1_vg4_i64(target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ld1_vg4_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ld1d { z0.d - z3.d }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.ld1.pn.vg4.nxv2i64(target("aarch64.svcount") %pn, ptr %ptr); + ret { , , , } %res +} + +define { , , , } @ld1_vg4_f16(target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ld1_vg4_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ld1h { z0.h - z3.h }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.ld1.pn.vg4.nxv8f16(target("aarch64.svcount") %pn, ptr %ptr); + ret { , , , } %res +} + +define { , , , } @ld1_vg4_bf16(target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ld1_vg4_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ld1h { z0.h - z3.h }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.ld1.pn.vg4.nxv8bf16(target("aarch64.svcount") %pn, ptr %ptr); + ret { , , , } %res +} + +define { , , , } @ld1_vg4_f32(target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ld1_vg4_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ld1w { z0.s - z3.s }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.ld1.pn.vg4.nxv4f32(target("aarch64.svcount") %pn, ptr %ptr); + ret { , , , } %res +} + +define { , , , } @ld1_vg4_f64(target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ld1_vg4_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ld1d { z0.d - z3.d }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.ld1.pn.vg4.nxv2f64(target("aarch64.svcount") %pn, ptr %ptr); + ret { , , , } %res +} + +; Test to ensure we load into the correct registers for the instruction +define @ld1_vg4_i16_z0_taken(target("aarch64.svcount") %pn, ptr %ptr, %val) { +; CHECK-LABEL: ld1_vg4_i16_z0_taken: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ld1h { z4.h - z7.h }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: add z0.h, z0.h, z4.h +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %ld1 = call { , , , } @llvm.aarch64.sve.ld1.pn.vg4.nxv8i16(target("aarch64.svcount") %pn, ptr %ptr); + %ld1_0 = extractvalue { , , , } %ld1, 0 + %res = add %val, %ld1_0 + ret %res +} + + +; == Non-temporal Multi-Vector Consecutive Loads == + +define { , } @ldnt1_vg2_i8(target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ldnt1_vg2_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ldnt1b { z0.b, z1.b }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.ldnt1.pn.vg2.nxv16i8(target("aarch64.svcount") %pn, ptr %ptr); + ret { , } %res +} + +define { , } @ldnt1_vg2_i16(target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ldnt1_vg2_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ldnt1h { z0.h, z1.h }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.ldnt1.pn.vg2.nxv8i16(target("aarch64.svcount") %pn, ptr %ptr); + ret { , } %res +} + +define { , } @ldnt1_vg2_i32(target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ldnt1_vg2_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ldnt1w { z0.s, z1.s }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.ldnt1.pn.vg2.nxv4i32(target("aarch64.svcount") %pn, ptr %ptr); + ret { , } %res +} + +define { , } @ldnt1_vg2_i64(target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ldnt1_vg2_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ldnt1d { z0.d, z1.d }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.ldnt1.pn.vg2.nxv2i64(target("aarch64.svcount") %pn, ptr %ptr); + ret { , } %res +} + +define { , } @ldnt1_vg2_f16(target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ldnt1_vg2_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ldnt1h { z0.h, z1.h }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.ldnt1.pn.vg2.nxv8f16(target("aarch64.svcount") %pn, ptr %ptr); + ret { , } %res +} + +define { , } @ldnt1_vg2_bf16(target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ldnt1_vg2_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ldnt1h { z0.h, z1.h }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.ldnt1.pn.vg2.nxv8bf16(target("aarch64.svcount") %pn, ptr %ptr); + ret { , } %res +} + +define { , } @ldnt1_vg2_f32(target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ldnt1_vg2_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ldnt1w { z0.s, z1.s }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.ldnt1.pn.vg2.nxv4f32(target("aarch64.svcount") %pn, ptr %ptr); + ret { , } %res +} + +define { , } @ldnt1_vg2_f64(target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ldnt1_vg2_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ldnt1d { z0.d, z1.d }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.ldnt1.pn.vg2.nxv2f64(target("aarch64.svcount") %pn, ptr %ptr); + ret { , } %res +} + +; Test to ensure we load into the correct registers for the instruction +define @ldnt1_vg2_i32_z0_taken(target("aarch64.svcount") %pn, ptr %ptr, %val) { +; CHECK-LABEL: ldnt1_vg2_i32_z0_taken: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ldnt1w { z2.s, z3.s }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: add z0.s, z0.s, z2.s +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %ld1 = call { , } @llvm.aarch64.sve.ldnt1.pn.vg2.nxv4i32(target("aarch64.svcount") %pn, ptr %ptr); + %ld1_0 = extractvalue { , } %ld1, 0 + %res = add %val, %ld1_0 + ret %res +} + +define { , , , } @ldnt1_vg4_i8(target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ldnt1_vg4_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ldnt1b { z0.b - z3.b }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.ldnt1.pn.vg4.nxv16i8(target("aarch64.svcount") %pn, ptr %ptr); + ret { , , , } %res +} + +define { , , , } @ldnt1_vg4_i16(target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ldnt1_vg4_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ldnt1h { z0.h - z3.h }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.ldnt1.pn.vg4.nxv8i16(target("aarch64.svcount") %pn, ptr %ptr); + ret { , , , } %res +} + +define { , , , } @ldnt1_vg4_i32(target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ldnt1_vg4_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ldnt1w { z0.s - z3.s }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.ldnt1.pn.vg4.nxv4i32(target("aarch64.svcount") %pn, ptr %ptr); + ret { , , , } %res +} + +define { , , , } @ldnt1_vg4_i64(target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ldnt1_vg4_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ldnt1d { z0.d - z3.d }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.ldnt1.pn.vg4.nxv2i64(target("aarch64.svcount") %pn, ptr %ptr); + ret { , , , } %res +} + +define { , , , } @ldnt1_vg4_f16(target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ldnt1_vg4_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ldnt1h { z0.h - z3.h }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.ldnt1.pn.vg4.nxv8f16(target("aarch64.svcount") %pn, ptr %ptr); + ret { , , , } %res +} + +define { , , , } @ldnt1_vg4_bf16(target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ldnt1_vg4_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ldnt1h { z0.h - z3.h }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.ldnt1.pn.vg4.nxv8bf16(target("aarch64.svcount") %pn, ptr %ptr); + ret { , , , } %res +} + +define { , , , } @ldnt1_vg4_f32(target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ldnt1_vg4_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ldnt1w { z0.s - z3.s }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.ldnt1.pn.vg4.nxv4f32(target("aarch64.svcount") %pn, ptr %ptr); + ret { , , , } %res +} + +define { , , , } @ldnt1_vg4_f64(target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ldnt1_vg4_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ldnt1d { z0.d - z3.d }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.ldnt1.pn.vg4.nxv2f64(target("aarch64.svcount") %pn, ptr %ptr); + ret { , , , } %res +} + +; Test to ensure we load into the correct registers for the instruction +define @ldnt1_vg4_i64_z0_taken(target("aarch64.svcount") %pn, ptr %ptr, %val) { +; CHECK-LABEL: ldnt1_vg4_i64_z0_taken: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ldnt1d { z4.d - z7.d }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: add z0.d, z0.d, z4.d +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %ld1 = call { , , , } @llvm.aarch64.sve.ldnt1.pn.vg4.nxv2i64(target("aarch64.svcount") %pn, ptr %ptr); + %ld1_0 = extractvalue { , , , } %ld1, 0 + %res = add %val, %ld1_0 + ret %res +} + +declare { , } @llvm.aarch64.sve.ld1.pn.vg2.nxv2i64(target("aarch64.svcount"), ptr) +declare { , } @llvm.aarch64.sve.ld1.pn.vg2.nxv4i32(target("aarch64.svcount"), ptr) +declare { , } @llvm.aarch64.sve.ld1.pn.vg2.nxv8i16(target("aarch64.svcount"), ptr) +declare { , } @llvm.aarch64.sve.ld1.pn.vg2.nxv16i8(target("aarch64.svcount"), ptr) +declare { , } @llvm.aarch64.sve.ld1.pn.vg2.nxv2f64(target("aarch64.svcount"), ptr) +declare { , } @llvm.aarch64.sve.ld1.pn.vg2.nxv4f32(target("aarch64.svcount"), ptr) +declare { , } @llvm.aarch64.sve.ld1.pn.vg2.nxv8f16(target("aarch64.svcount"), ptr) +declare { , } @llvm.aarch64.sve.ld1.pn.vg2.nxv8bf16(target("aarch64.svcount"), ptr) + +declare { , , , } @llvm.aarch64.sve.ld1.pn.vg4.nxv2i64(target("aarch64.svcount"), ptr) +declare { , , , } @llvm.aarch64.sve.ld1.pn.vg4.nxv4i32(target("aarch64.svcount"), ptr) +declare { , , , } @llvm.aarch64.sve.ld1.pn.vg4.nxv8i16(target("aarch64.svcount"), ptr) +declare { , , , } @llvm.aarch64.sve.ld1.pn.vg4.nxv16i8(target("aarch64.svcount"), ptr) +declare { , , , } @llvm.aarch64.sve.ld1.pn.vg4.nxv2f64(target("aarch64.svcount"), ptr) +declare { , , , } @llvm.aarch64.sve.ld1.pn.vg4.nxv4f32(target("aarch64.svcount"), ptr) +declare { , , , } @llvm.aarch64.sve.ld1.pn.vg4.nxv8f16(target("aarch64.svcount"), ptr) +declare { , , , } @llvm.aarch64.sve.ld1.pn.vg4.nxv8bf16(target("aarch64.svcount"), ptr) + +declare { , } @llvm.aarch64.sve.ldnt1.pn.vg2.nxv2i64(target("aarch64.svcount"), ptr) +declare { , } @llvm.aarch64.sve.ldnt1.pn.vg2.nxv4i32(target("aarch64.svcount"), ptr) +declare { , } @llvm.aarch64.sve.ldnt1.pn.vg2.nxv8i16(target("aarch64.svcount"), ptr) +declare { , } @llvm.aarch64.sve.ldnt1.pn.vg2.nxv16i8(target("aarch64.svcount"), ptr) +declare { , } @llvm.aarch64.sve.ldnt1.pn.vg2.nxv2f64(target("aarch64.svcount"), ptr) +declare { , } @llvm.aarch64.sve.ldnt1.pn.vg2.nxv4f32(target("aarch64.svcount"), ptr) +declare { , } @llvm.aarch64.sve.ldnt1.pn.vg2.nxv8f16(target("aarch64.svcount"), ptr) +declare { , } 
@llvm.aarch64.sve.ldnt1.pn.vg2.nxv8bf16(target("aarch64.svcount"), ptr) + +declare { , , , } @llvm.aarch64.sve.ldnt1.pn.vg4.nxv2i64(target("aarch64.svcount"), ptr) +declare { , , , } @llvm.aarch64.sve.ldnt1.pn.vg4.nxv4i32(target("aarch64.svcount"), ptr) +declare { , , , } @llvm.aarch64.sve.ldnt1.pn.vg4.nxv8i16(target("aarch64.svcount"), ptr) +declare { , , , } @llvm.aarch64.sve.ldnt1.pn.vg4.nxv16i8(target("aarch64.svcount"), ptr) +declare { , , , } @llvm.aarch64.sve.ldnt1.pn.vg4.nxv2f64(target("aarch64.svcount"), ptr) +declare { , , , } @llvm.aarch64.sve.ldnt1.pn.vg4.nxv4f32(target("aarch64.svcount"), ptr) +declare { , , , } @llvm.aarch64.sve.ldnt1.pn.vg4.nxv8f16(target("aarch64.svcount"), ptr) +declare { , , , } @llvm.aarch64.sve.ldnt1.pn.vg4.nxv8bf16(target("aarch64.svcount"), ptr) diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-predicate-as-counter.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-predicate-as-counter.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-predicate-as-counter.ll @@ -0,0 +1,185 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64 -mattr=+sve2p1 < %s | FileCheck %s + +define @pext_b(target("aarch64.svcount") %x) nounwind { +; CHECK-LABEL: pext_b: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: pext p0.b, pn8[2] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.pext.nxv16i1(target("aarch64.svcount") %x, i32 2) + ret %res +} + +define @pext_h(target("aarch64.svcount") %x) nounwind { +; CHECK-LABEL: pext_h: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: pext p0.h, pn8[2] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.pext.nxv8i1(target("aarch64.svcount") %x, i32 2) + ret %res +} + +define @pext_s(target("aarch64.svcount") %x) nounwind { +; CHECK-LABEL: pext_s: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: pext p0.s, pn8[2] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.pext.nxv4i1(target("aarch64.svcount") %x, i32 2) + ret %res +} + +define @pext_d(target("aarch64.svcount") %x) nounwind { +; CHECK-LABEL: pext_d: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: pext p0.d, pn8[2] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.pext.nxv2i1(target("aarch64.svcount") %x, i32 2) + ret %res +} + +declare @llvm.aarch64.sve.pext.nxv16i1(target("aarch64.svcount"), i32) +declare @llvm.aarch64.sve.pext.nxv8i1(target("aarch64.svcount"), i32) +declare @llvm.aarch64.sve.pext.nxv4i1(target("aarch64.svcount"), i32) +declare @llvm.aarch64.sve.pext.nxv2i1(target("aarch64.svcount"), i32) + +define {,} @pext_x2_b(target("aarch64.svcount") %x) nounwind { +; CHECK-LABEL: pext_x2_b: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: pext { p0.b, p1.b }, pn8[1] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call {,} @llvm.aarch64.sve.pext.x2.nxv16i1(target("aarch64.svcount") %x, i32 1) + ret {,} %res +} + +define {,} @pext_x2_h(target("aarch64.svcount") %x) nounwind { +; CHECK-LABEL: pext_x2_h: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: pext { p0.h, p1.h }, pn8[1] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call {,} @llvm.aarch64.sve.pext.x2.nxv8i1(target("aarch64.svcount") %x, i32 1) + ret {,} %res +} + +define {,} @pext_x2_s(target("aarch64.svcount") %x) nounwind { +; CHECK-LABEL: pext_x2_s: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: pext { p0.s, p1.s }, pn8[1] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call {,} @llvm.aarch64.sve.pext.x2.nxv4i1(target("aarch64.svcount") %x, i32 1) + ret {,} %res +} + +define {,} @pext_x2_d(target("aarch64.svcount") %x) nounwind { +; CHECK-LABEL: pext_x2_d: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: pext { p0.d, p1.d }, pn8[1] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call {,} @llvm.aarch64.sve.pext.x2.nxv2i1(target("aarch64.svcount") %x, i32 1) + ret {,} %res +} + +declare {,} @llvm.aarch64.sve.pext.x2.nxv16i1(target("aarch64.svcount"), i32) +declare {,} @llvm.aarch64.sve.pext.x2.nxv8i1(target("aarch64.svcount"), i32) +declare {,} @llvm.aarch64.sve.pext.x2.nxv4i1(target("aarch64.svcount"), i32) +declare {,} @llvm.aarch64.sve.pext.x2.nxv2i1(target("aarch64.svcount"), i32) + +define target("aarch64.svcount") @ptrue_b() nounwind { +; CHECK-LABEL: ptrue_b: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue pn8.b +; CHECK-NEXT: mov p0.b, p8.b +; CHECK-NEXT: ret + %res = call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8() + ret target("aarch64.svcount") %res +} + +define target("aarch64.svcount") @ptrue_h() nounwind { +; CHECK-LABEL: ptrue_h: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue pn8.h +; CHECK-NEXT: mov p0.b, p8.b +; CHECK-NEXT: ret + %res = call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c16() + ret target("aarch64.svcount") %res +} + +define target("aarch64.svcount") @ptrue_s() nounwind { +; CHECK-LABEL: ptrue_s: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue pn8.s +; CHECK-NEXT: mov p0.b, p8.b +; CHECK-NEXT: ret + %res = call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c32() + ret target("aarch64.svcount") %res +} + +define target("aarch64.svcount") @ptrue_d() nounwind { +; CHECK-LABEL: ptrue_d: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue pn8.d +; CHECK-NEXT: mov p0.b, p8.b +; CHECK-NEXT: ret + %res = call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c64() + ret target("aarch64.svcount") %res +} + +declare target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8() +declare target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c16() +declare target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c32() +declare target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c64() diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-qrshr.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-qrshr.ll --- a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-qrshr.ll +++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-qrshr.ll @@ -1,27 +1,27 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -mattr=+sme2 -mattr=+bf16 -verify-machineinstrs < %s | FileCheck %s ; ; S/UQRSHRN x2 ; -define @multi_vector_sat_shift_narrow_interleave_x2_s16( %unused, %zn1, %zn2) { +define @multi_vector_sat_shift_narrow_interleave_x2_s16( %zn1, %zn2) { ; CHECK-LABEL: multi_vector_sat_shift_narrow_interleave_x2_s16: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z3.d, z2.d -; CHECK-NEXT: mov z2.d, z1.d -; CHECK-NEXT: sqrshrn z0.h, { z2.s, z3.s }, #16 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: sqrshrn z0.h, { z0.s, z1.s }, #16 ; CHECK-NEXT: ret %res = call @llvm.aarch64.sve.sqrshrn.x2.nxv8i16( %zn1, %zn2, i32 16) ret %res } -define @multi_vector_sat_shift_narrow_interleave_x2_u16( %unused, %zn1, %zn2) { +define @multi_vector_sat_shift_narrow_interleave_x2_u16( %zn1, %zn2) 
{ ; CHECK-LABEL: multi_vector_sat_shift_narrow_interleave_x2_u16: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z3.d, z2.d -; CHECK-NEXT: mov z2.d, z1.d -; CHECK-NEXT: uqrshrn z0.h, { z2.s, z3.s }, #16 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: uqrshrn z0.h, { z0.s, z1.s }, #16 ; CHECK-NEXT: ret %res = call @llvm.aarch64.sve.uqrshrn.x2.nxv8i16( %zn1, %zn2, i32 16) ret %res @@ -31,12 +31,12 @@ ; SQRSHRUN x2 ; -define @multi_vector_sat_shift_unsigned_narrow_interleave_x2_s16( %unused, %zn1, %zn2) { +define @multi_vector_sat_shift_unsigned_narrow_interleave_x2_s16( %zn1, %zn2) { ; CHECK-LABEL: multi_vector_sat_shift_unsigned_narrow_interleave_x2_s16: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z3.d, z2.d -; CHECK-NEXT: mov z2.d, z1.d -; CHECK-NEXT: sqrshrun z0.h, { z2.s, z3.s }, #16 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: sqrshrun z0.h, { z0.s, z1.s }, #16 ; CHECK-NEXT: ret %res = call @llvm.aarch64.sve.sqrshrun.x2.nxv8i16( %zn1, %zn2, i32 16) ret %res diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-sclamp.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-sclamp.ll --- a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-sclamp.ll +++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-sclamp.ll @@ -6,8 +6,7 @@ define @test_sclamp_i8( %a, %b, %c) #0 { ; CHECK-LABEL: test_sclamp_i8: ; CHECK: // %bb.0: -; CHECK-NEXT: sclamp z2.b, z0.b, z1.b -; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: sclamp z0.b, z1.b, z2.b ; CHECK-NEXT: ret %res = call @llvm.aarch64.sve.sclamp.nxv16i8( %a, %b, %c) ret %res @@ -16,8 +15,7 @@ define @test_sclamp_i16( %a, %b, %c) #0 { ; CHECK-LABEL: test_sclamp_i16: ; CHECK: // %bb.0: -; CHECK-NEXT: sclamp z2.h, z0.h, z1.h -; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: sclamp z0.h, z1.h, z2.h ; CHECK-NEXT: ret %res = call @llvm.aarch64.sve.sclamp.nxv8i16( %a, %b, %c) ret %res @@ -26,8 +24,7 @@ define @test_sclamp_i32( %a, %b, %c) #0 { ; CHECK-LABEL: test_sclamp_i32: ; CHECK: // %bb.0: -; CHECK-NEXT: sclamp z2.s, z0.s, z1.s -; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: sclamp z0.s, z1.s, z2.s ; CHECK-NEXT: ret %res = call @llvm.aarch64.sve.sclamp.nxv4i32( %a, %b, %c) ret %res @@ -36,8 +33,7 @@ define @test_sclamp_i64( %a, %b, %c) #0 { ; CHECK-LABEL: test_sclamp_i64: ; CHECK: // %bb.0: -; CHECK-NEXT: sclamp z2.d, z0.d, z1.d -; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: sclamp z0.d, z1.d, z2.d ; CHECK-NEXT: ret %res = call @llvm.aarch64.sve.sclamp.nxv2i64( %a, %b, %c) ret %res diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-selx2.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-selx2.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-selx2.ll @@ -0,0 +1,174 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -verify-machineinstrs < %s | FileCheck %s + +; == 8 to 64-bit elements == + +define { , } @sel_x2_i8(target("aarch64.svcount") %pn, %unused, %zn1, %zn2, %zm1, %zm2) nounwind { +; CHECK-LABEL: sel_x2_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z5.d, z4.d +; CHECK-NEXT: mov z7.d, z2.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z4.d, z3.d +; CHECK-NEXT: mov z6.d, z1.d +; CHECK-NEXT: sel { z0.b, z1.b }, pn8, { z6.b, z7.b }, { z4.b, z5.b } +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.sel.x2.nxv16i8(target("aarch64.svcount") %pn, %zn1, %zn2, %zm1, %zm2) + ret { , } %res +} + +define { , } @sel_x2_i16(target("aarch64.svcount") %pn, %unused, %zn1, %zn2, %zm1, %zm2) nounwind { +; CHECK-LABEL: sel_x2_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z5.d, z4.d +; CHECK-NEXT: mov z7.d, z2.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z4.d, z3.d +; CHECK-NEXT: mov z6.d, z1.d +; CHECK-NEXT: sel { z0.h, z1.h }, pn8, { z6.h, z7.h }, { z4.h, z5.h } +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.sel.x2.nxv8i16(target("aarch64.svcount") %pn, %zn1, %zn2, %zm1, %zm2) + ret { , } %res +} + +define { , } @sel_x2_f16(target("aarch64.svcount") %pn, %unused, %zn1, %zn2, %zm1, %zm2) nounwind { +; CHECK-LABEL: sel_x2_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z5.d, z4.d +; CHECK-NEXT: mov z7.d, z2.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z4.d, z3.d +; CHECK-NEXT: mov z6.d, z1.d +; CHECK-NEXT: sel { z0.h, z1.h }, pn8, { z6.h, z7.h }, { z4.h, z5.h } +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.sel.x2.nxv8f16(target("aarch64.svcount") %pn, %zn1, %zn2, %zm1, %zm2) + ret { , } %res +} + +define { , } @sel_x2_bf16(target("aarch64.svcount") %pn, %unused, %zn1, %zn2, %zm1, %zm2) nounwind { +; CHECK-LABEL: sel_x2_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z5.d, z4.d +; CHECK-NEXT: mov z7.d, z2.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z4.d, z3.d +; CHECK-NEXT: mov z6.d, z1.d +; CHECK-NEXT: sel { z0.h, z1.h }, pn8, { z6.h, z7.h }, { z4.h, z5.h } +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.sel.x2.nxv8bf16(target("aarch64.svcount") %pn, %zn1, %zn2, %zm1, %zm2) + ret { , } %res +} + +define { , } @sel_x2_i32(target("aarch64.svcount") %pn, %unused, %zn1, %zn2, %zm1, %zm2) nounwind { +; CHECK-LABEL: sel_x2_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z5.d, z4.d +; CHECK-NEXT: mov z7.d, z2.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z4.d, z3.d +; CHECK-NEXT: mov z6.d, z1.d +; CHECK-NEXT: sel { z0.s, z1.s }, pn8, { z6.s, z7.s }, { z4.s, z5.s } +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.sel.x2.nxv4i32(target("aarch64.svcount") %pn, %zn1, %zn2, %zm1, %zm2) + ret { , } %res +} + +define { , } @sel_x2_f32(target("aarch64.svcount") %pn, %unused, %zn1, %zn2, %zm1, %zm2) nounwind { +; CHECK-LABEL: sel_x2_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z5.d, z4.d +; CHECK-NEXT: mov z7.d, z2.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z4.d, z3.d +; CHECK-NEXT: mov z6.d, z1.d +; CHECK-NEXT: sel { z0.s, z1.s }, pn8, { z6.s, z7.s }, { z4.s, z5.s } +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.sel.x2.nxv4f32(target("aarch64.svcount") %pn, %zn1, %zn2, %zm1, %zm2) + ret { , } %res +} + +define { , } @sel_x2_i64(target("aarch64.svcount") %pn, %unused, %zn1, %zn2, %zm1, %zm2) nounwind { +; CHECK-LABEL: sel_x2_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z5.d, z4.d +; CHECK-NEXT: mov z7.d, z2.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z4.d, z3.d +; CHECK-NEXT: mov z6.d, z1.d +; CHECK-NEXT: sel { z0.d, z1.d }, pn8, { z6.d, z7.d }, { z4.d, z5.d } +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.sel.x2.nxv2i64(target("aarch64.svcount") %pn, %zn1, %zn2, %zm1, %zm2) + ret { , } %res +} + +define { , } @sel_x2_f64(target("aarch64.svcount") %pn, %unused, %zn1, %zn2, %zm1, %zm2) nounwind { +; CHECK-LABEL: sel_x2_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z5.d, z4.d +; CHECK-NEXT: mov z7.d, z2.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z4.d, z3.d +; CHECK-NEXT: mov z6.d, z1.d +; CHECK-NEXT: sel { z0.d, z1.d }, pn8, { z6.d, z7.d }, { z4.d, z5.d } +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.sel.x2.nxv2f64(target("aarch64.svcount") %pn, %zn1, %zn2, %zm1, %zm2) + ret { , } %res +} + +; == 8 to 64-bit elements == +declare { , } @llvm.aarch64.sve.sel.x2.nxv16i8(target("aarch64.svcount") %pn, %zn1, %zn2, %zm1, %zm2) +declare { , } @llvm.aarch64.sve.sel.x2.nxv8i16(target("aarch64.svcount") %pn, %zn1, %zn2, %zm1, %zm2) +declare { , } @llvm.aarch64.sve.sel.x2.nxv4i32(target("aarch64.svcount") %pn, %zn1, %zn2, %zm1, %zm2) +declare { , } @llvm.aarch64.sve.sel.x2.nxv2i64(target("aarch64.svcount") %pn, %zn1, %zn2, %zm1, %zm2) +declare { , } @llvm.aarch64.sve.sel.x2.nxv8f16(target("aarch64.svcount") %pn, %zn1, %zn2, %zm1, %zm2) +declare { , } @llvm.aarch64.sve.sel.x2.nxv8bf16(target("aarch64.svcount") %pn, %zn1, %zn2, %zm1, %zm2) +declare { , } @llvm.aarch64.sve.sel.x2.nxv4f32(target("aarch64.svcount") %pn, %zn1, %zn2, %zm1, %zm2) +declare { , } @llvm.aarch64.sve.sel.x2.nxv2f64(target("aarch64.svcount") %pn, %zn1, %zn2, %zm1, %zm2) diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-selx4.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-selx4.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-selx4.ll @@ -0,0 +1,215 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -verify-machineinstrs < %s | FileCheck %s + +; == 8 to 64-bit elements == + +define { , , , } @sel_x4_i8(target("aarch64.svcount") %pn, %unused, %zn1, %zn2, %zn3, %zn4, %zm1, %zm2, %zm3, %zm4) nounwind { +; CHECK-LABEL: sel_x4_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z26.d, z7.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov z25.d, z6.d +; CHECK-NEXT: ptrue p1.b +; CHECK-NEXT: mov z24.d, z5.d +; CHECK-NEXT: mov z31.d, z4.d +; CHECK-NEXT: ld1b { z27.b }, p1/z, [x0] +; CHECK-NEXT: mov z30.d, z3.d +; CHECK-NEXT: mov z29.d, z2.d +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z28.d, z1.d +; CHECK-NEXT: sel { z0.b - z3.b }, pn8, { z28.b - z31.b }, { z24.b - z27.b } +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.sel.x4.nxv16i8(target("aarch64.svcount") %pn, %zn1, %zn2, %zn3, %zn4, %zm1, %zm2, %zm3, %zm4) + ret { , , , } %res +} + +define { , , , } @sel_x4_i16(target("aarch64.svcount") %pn, %unused, %zn1, %zn2, %zn3, %zn4, %zm1, %zm2, %zm3, %zm4) nounwind { +; CHECK-LABEL: sel_x4_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z26.d, z7.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov z25.d, z6.d +; CHECK-NEXT: ptrue p1.h +; CHECK-NEXT: mov z24.d, z5.d +; CHECK-NEXT: mov z31.d, z4.d +; CHECK-NEXT: ld1h { z27.h }, p1/z, [x0] +; CHECK-NEXT: mov z30.d, z3.d +; CHECK-NEXT: mov z29.d, z2.d +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z28.d, z1.d +; CHECK-NEXT: sel { z0.h - z3.h }, pn8, { z28.h - z31.h }, { z24.h - z27.h } +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.sel.x4.nxv8i16(target("aarch64.svcount") %pn, %zn1, %zn2, %zn3, %zn4, %zm1, %zm2, %zm3, %zm4) + ret { , , , } %res +} + +define { , , , } @sel_x4_f16(target("aarch64.svcount") %pn, %unused, %zn1, %zn2, %zn3, %zn4, %zm1, %zm2, %zm3, %zm4) nounwind { +; CHECK-LABEL: sel_x4_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z26.d, z7.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov z25.d, z6.d +; CHECK-NEXT: ptrue p1.h +; CHECK-NEXT: mov z24.d, z5.d +; CHECK-NEXT: mov z31.d, z4.d +; CHECK-NEXT: ld1h { z27.h }, p1/z, [x0] +; CHECK-NEXT: mov z30.d, z3.d +; CHECK-NEXT: mov z29.d, z2.d +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z28.d, z1.d +; CHECK-NEXT: sel { z0.h - z3.h }, pn8, { z28.h - z31.h }, { z24.h - z27.h } +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.sel.x4.nxv8f16(target("aarch64.svcount") %pn, %zn1, %zn2, %zn3, %zn4, %zm1, %zm2, %zm3, %zm4) + ret { , , , } %res +} + +define { , , , } @sel_x4_bf16(target("aarch64.svcount") %pn, %unused, %zn1, %zn2, %zn3, %zn4, %zm1, %zm2, %zm3, %zm4) nounwind { +; CHECK-LABEL: sel_x4_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z26.d, z7.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov z25.d, z6.d +; CHECK-NEXT: ptrue p1.h +; CHECK-NEXT: mov z24.d, z5.d +; CHECK-NEXT: mov z31.d, z4.d +; CHECK-NEXT: ld1h { z27.h }, p1/z, [x0] +; CHECK-NEXT: mov z30.d, z3.d +; CHECK-NEXT: mov z29.d, z2.d +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z28.d, z1.d +; CHECK-NEXT: sel { z0.h - z3.h }, pn8, { z28.h - z31.h }, { z24.h - z27.h } +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.sel.x4.nxv8bf16(target("aarch64.svcount") %pn, %zn1, %zn2, %zn3, %zn4, %zm1, %zm2, %zm3, %zm4) + ret { , , , } %res +} + +define { , , , } @sel_x4_i32(target("aarch64.svcount") %pn, %unused, %zn1, %zn2, %zn3, %zn4, %zm1, %zm2, %zm3, %zm4) nounwind { +; CHECK-LABEL: sel_x4_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z26.d, z7.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov z25.d, z6.d +; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: mov z24.d, z5.d +; CHECK-NEXT: mov z31.d, z4.d +; CHECK-NEXT: ld1w { z27.s }, p1/z, [x0] +; CHECK-NEXT: mov z30.d, z3.d +; CHECK-NEXT: mov z29.d, z2.d +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z28.d, z1.d +; CHECK-NEXT: sel { z0.s - z3.s }, pn8, { z28.s - z31.s }, { z24.s - z27.s } +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.sel.x4.nxv4i32(target("aarch64.svcount") %pn, %zn1, %zn2, %zn3, %zn4, %zm1, %zm2, %zm3, %zm4) + ret { , , , } %res +} + +define { , , , } @sel_x4_f32(target("aarch64.svcount") %pn, %unused, %zn1, %zn2, %zn3, %zn4, %zm1, %zm2, %zm3, %zm4) nounwind { +; CHECK-LABEL: sel_x4_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z26.d, z7.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov z25.d, z6.d +; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: mov z24.d, z5.d +; CHECK-NEXT: mov z31.d, z4.d +; CHECK-NEXT: ld1w { z27.s }, p1/z, [x0] +; CHECK-NEXT: mov z30.d, z3.d +; CHECK-NEXT: mov z29.d, z2.d +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z28.d, z1.d +; CHECK-NEXT: sel { z0.s - z3.s }, pn8, { z28.s - z31.s }, { z24.s - z27.s } +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.sel.x4.nxv4f32(target("aarch64.svcount") %pn, %zn1, %zn2, %zn3, %zn4, %zm1, %zm2, %zm3, %zm4) + ret { , , , } %res +} + +define { , , , } @sel_x4_i64(target("aarch64.svcount") %pn, %unused, %zn1, %zn2, %zn3, %zn4, %zm1, %zm2, %zm3, %zm4) nounwind { +; CHECK-LABEL: sel_x4_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z26.d, z7.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov z25.d, z6.d +; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: mov z24.d, z5.d +; CHECK-NEXT: mov z31.d, z4.d +; CHECK-NEXT: ld1d { z27.d }, p1/z, [x0] +; CHECK-NEXT: mov z30.d, z3.d +; CHECK-NEXT: mov z29.d, z2.d +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z28.d, z1.d +; CHECK-NEXT: sel { z0.d - z3.d }, pn8, { z28.d - z31.d }, { z24.d - z27.d } +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.sel.x4.nxv2i64(target("aarch64.svcount") %pn, %zn1, %zn2, %zn3, %zn4, %zm1, %zm2, %zm3, %zm4) + ret { , , , } %res +} + +define { , , , } @sel_x4_f64(target("aarch64.svcount") %pn, %unused, %zn1, %zn2, %zn3, %zn4, %zm1, %zm2, %zm3, %zm4) nounwind { +; CHECK-LABEL: sel_x4_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z26.d, z7.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov z25.d, z6.d +; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: mov z24.d, z5.d +; CHECK-NEXT: mov z31.d, z4.d +; CHECK-NEXT: ld1d { z27.d }, p1/z, [x0] +; CHECK-NEXT: mov z30.d, z3.d +; CHECK-NEXT: mov z29.d, z2.d +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z28.d, z1.d +; CHECK-NEXT: sel { z0.d - z3.d }, pn8, { z28.d - z31.d }, { z24.d - z27.d } +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.sel.x4.nxv2f64(target("aarch64.svcount") %pn, %zn1, %zn2, %zn3, %zn4, %zm1, %zm2, %zm3, %zm4) + ret { , , , } %res +} + + +; == 8 to 64-bit elements == +declare { , , , } @llvm.aarch64.sve.sel.x4.nxv16i8(target("aarch64.svcount") %pn, %zn1, %zn2, %zn3, %zn4, %zm1, %zm2, %zm3, %zm4) +declare { , , , } @llvm.aarch64.sve.sel.x4.nxv8i16(target("aarch64.svcount") %pn, %zn1, %zn2, %zn3, %zn4, %zm1, %zm2, %zm3, %zm4) +declare { , , , } @llvm.aarch64.sve.sel.x4.nxv4i32(target("aarch64.svcount") %pn, %zn1, %zn2, %zn3, %zn4, %zm1, %zm2, %zm3, %zm4) +declare { , , , } @llvm.aarch64.sve.sel.x4.nxv2i64(target("aarch64.svcount") %pn, %zn1, %zn2, %zn3, %zn4, %zm1, %zm2, %zm3, %zm4) +declare { , , , } @llvm.aarch64.sve.sel.x4.nxv8f16(target("aarch64.svcount") %pn, %zn1, %zn2, %zn3, %zn4, %zm1, %zm2, %zm3, %zm4) +declare { , , , } @llvm.aarch64.sve.sel.x4.nxv8bf16(target("aarch64.svcount") %pn, %zn1, %zn2, %zn3, %zn4, %zm1, %zm2, %zm3, %zm4) +declare { , , , } @llvm.aarch64.sve.sel.x4.nxv4f32(target("aarch64.svcount") %pn, %zn1, %zn2, %zn3, %zn4, %zm1, %zm2, %zm3, %zm4) +declare { , , , } @llvm.aarch64.sve.sel.x4.nxv2f64(target("aarch64.svcount") %pn, %zn1, %zn2, %zn3, %zn4, %zm1, %zm2, %zm3, %zm4) diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-stores.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-stores.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-stores.ll @@ -0,0 +1,731 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1,+bf16 < %s | FileCheck %s + +; == Normal Multi-Vector Consecutive Stores == + +define void @st1_vg2_i8( %zn0, %zn1, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: st1_vg2_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: st1b { z0.b, z1.b }, pn8, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.pn.vg2.nxv16i8( %zn0, %zn1, target("aarch64.svcount") %pn, ptr %ptr); + ret void +} + +define void @st1_vg2_i16( %zn0, %zn1, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: st1_vg2_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: st1h { z0.h, z1.h }, pn8, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.pn.vg2.nxv8i16( %zn0, %zn1, target("aarch64.svcount") %pn, ptr %ptr); + ret void +} + +define void @st1_vg2_i32( %zn0, %zn1, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: st1_vg2_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: st1w { z0.s, z1.s }, pn8, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.pn.vg2.nxv4i32( %zn0, %zn1, target("aarch64.svcount") %pn, ptr %ptr); + ret void +} + +define void @st1_vg2_i64( %zn0, %zn1, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: st1_vg2_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: st1d { z0.d, z1.d }, pn8, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.pn.vg2.nxv2i64( %zn0, %zn1, target("aarch64.svcount") %pn, ptr %ptr); + ret void +} + +define void @st1_vg2_f16( %zn0, %zn1, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: st1_vg2_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: st1h { z0.h, z1.h }, pn8, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.pn.vg2.nxv8f16( %zn0, %zn1, target("aarch64.svcount") %pn, ptr %ptr); + ret void +} + +define void @st1_vg2_bf16( %zn0, %zn1, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: st1_vg2_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: st1h { z0.h, z1.h }, pn8, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.pn.vg2.nxv8bf16( %zn0, %zn1, target("aarch64.svcount") %pn, ptr %ptr); + ret void +} + +define void @st1_vg2_f32( %zn0, %zn1, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: st1_vg2_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: st1w { z0.s, z1.s }, pn8, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.pn.vg2.nxv4f32( %zn0, %zn1, target("aarch64.svcount") %pn, ptr %ptr); + ret void +} + +define void @st1_vg2_f64( %zn0, %zn1, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: st1_vg2_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: st1d { z0.d, z1.d }, pn8, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.pn.vg2.nxv2f64( %zn0, %zn1, target("aarch64.svcount") %pn, ptr %ptr); + ret void +} + +; Test to show we choose the correct registers for the instruction +define void @st1_vg2_i16_swap_regs( %zn0, %zn1, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: st1_vg2_i16_swap_regs: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z2.d, z1.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z3.d, z0.d +; CHECK-NEXT: st1h { z2.h, z3.h }, pn8, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.pn.vg2.nxv8i16( %zn1, %zn0, target("aarch64.svcount") %pn, ptr %ptr); + ret void +} + + +define void @st1_vg4_i8( %zn0, %zn1, %zn2, %zn3, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: st1_vg4_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: st1b { z0.b - z3.b }, pn8, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.pn.vg4.nxv16i8( %zn0, %zn1, %zn2, %zn3, target("aarch64.svcount") %pn, ptr %ptr); + ret void +} + +define void @st1_vg4_i16( %zn0, %zn1, %zn2, %zn3, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: st1_vg4_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: st1h { z0.h - z3.h }, pn8, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.pn.vg4.nxv8i16( %zn0, %zn1, %zn2, %zn3, target("aarch64.svcount") %pn, ptr %ptr); + ret void +} + +define void @st1_vg4_i32( %zn0, %zn1, %zn2, %zn3, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: st1_vg4_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: st1w { z0.s - z3.s }, pn8, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.pn.vg4.nxv4i32( %zn0, %zn1, %zn2, %zn3, target("aarch64.svcount") %pn, ptr %ptr); + ret void +} + +define void @st1_vg4_i64( %zn0, %zn1, %zn2, %zn3, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: st1_vg4_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: st1d { z0.d - z3.d }, pn8, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.pn.vg4.nxv2i64( %zn0, %zn1, %zn2, %zn3, target("aarch64.svcount") %pn, ptr %ptr); + ret void +} + +define void @st1_vg4_f16( %zn0, %zn1, %zn2, %zn3, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: st1_vg4_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: st1h { z0.h - z3.h }, pn8, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.pn.vg4.nxv8f16( %zn0, %zn1, %zn2, %zn3, target("aarch64.svcount") %pn, ptr %ptr); + ret void +} + +define void @st1_vg4_bf16( %zn0, %zn1, %zn2, %zn3, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: st1_vg4_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: st1h { z0.h - z3.h }, pn8, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.pn.vg4.nxv8bf16( %zn0, %zn1, %zn2, %zn3, target("aarch64.svcount") %pn, ptr %ptr); + ret void +} + +define void @st1_vg4_f32( %zn0, %zn1, %zn2, %zn3, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: st1_vg4_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: st1w { z0.s - z3.s }, pn8, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.pn.vg4.nxv4f32( %zn0, %zn1, %zn2, %zn3, target("aarch64.svcount") %pn, ptr %ptr); + ret void +} + +define void @st1_vg4_f64( %zn0, %zn1, %zn2, %zn3, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: st1_vg4_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: st1d { z0.d - z3.d }, pn8, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.pn.vg4.nxv2f64( %zn0, %zn1, %zn2, %zn3, target("aarch64.svcount") %pn, ptr %ptr); + ret void +} + +; Test to show we choose the correct registers for the instruction +define void @st1_vg4_i64_invert_regs( %zn0, %zn1, %zn2, %zn3, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: st1_vg4_i64_invert_regs: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z4.d, z3.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov z5.d, z2.d +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z6.d, z1.d +; CHECK-NEXT: mov z7.d, z0.d +; CHECK-NEXT: st1d { z4.d - z7.d }, pn8, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.pn.vg4.nxv2i64( %zn3, %zn2, %zn1, %zn0, target("aarch64.svcount") %pn, ptr %ptr); + ret void +} + +; == Non-temporal Multi-Vector Consecutive Stores == + +define void @stnt1_vg2_i8( %zn0, %zn1, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: stnt1_vg2_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]!
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: stnt1b { z0.b, z1.b }, pn8, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.stnt1.pn.vg2.nxv16i8( %zn0, %zn1, target("aarch64.svcount") %pn, ptr %ptr); + ret void +} + +define void @stnt1_vg2_i16( %zn0, %zn1, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: stnt1_vg2_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: stnt1h { z0.h, z1.h }, pn8, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.stnt1.pn.vg2.nxv8i16( %zn0, %zn1, target("aarch64.svcount") %pn, ptr %ptr); + ret void +} + +define void @stnt1_vg2_i32( %zn0, %zn1, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: stnt1_vg2_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: stnt1w { z0.s, z1.s }, pn8, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.stnt1.pn.vg2.nxv4i32( %zn0, %zn1, target("aarch64.svcount") %pn, ptr %ptr); + ret void +} + +define void @stnt1_vg2_i64( %zn0, %zn1, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: stnt1_vg2_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: stnt1d { z0.d, z1.d }, pn8, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.stnt1.pn.vg2.nxv2i64( %zn0, %zn1, target("aarch64.svcount") %pn, ptr %ptr); + ret void +} + +define void @stnt1_vg2_f16( %zn0, %zn1, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: stnt1_vg2_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: stnt1h { z0.h, z1.h }, pn8, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.stnt1.pn.vg2.nxv8f16( %zn0, %zn1, target("aarch64.svcount") %pn, ptr %ptr); + ret void +} + +define void @stnt1_vg2_bf16( %zn0, %zn1, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: stnt1_vg2_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: stnt1h { z0.h, z1.h }, pn8, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.stnt1.pn.vg2.nxv8bf16( %zn0, %zn1, target("aarch64.svcount") %pn, ptr %ptr); + ret void +} + +define void @stnt1_vg2_f32( %zn0, %zn1, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: stnt1_vg2_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: stnt1w { z0.s, z1.s }, pn8, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.stnt1.pn.vg2.nxv4f32( %zn0, %zn1, target("aarch64.svcount") %pn, ptr %ptr); + ret void +} + +define void @stnt1_vg2_f64( %zn0, %zn1, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: stnt1_vg2_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: stnt1d { z0.d, z1.d }, pn8, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.stnt1.pn.vg2.nxv2f64( %zn0, %zn1, target("aarch64.svcount") %pn, ptr %ptr); + ret void +} + +; Test to show we choose the correct registers for the instruction +define void @stnt1_vg2_i16_bad_reg_offset( %ignored, %zn0, %zn1, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: stnt1_vg2_i16_bad_reg_offset: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z3.d, z2.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z2.d, z1.d +; CHECK-NEXT: stnt1h { z2.h, z3.h }, pn8, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.stnt1.pn.vg2.nxv8i16( %zn0, %zn1, target("aarch64.svcount") %pn, ptr %ptr); + ret void +} + +define void @stnt1_vg4_i8( %zn0, %zn1, %zn2, %zn3, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: stnt1_vg4_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: stnt1b { z0.b - z3.b }, pn8, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.stnt1.pn.vg4.nxv16i8( %zn0, %zn1, %zn2, %zn3, target("aarch64.svcount") %pn, ptr %ptr); + ret void +} + +define void @stnt1_vg4_i16( %zn0, %zn1, %zn2, %zn3, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: stnt1_vg4_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: stnt1h { z0.h - z3.h }, pn8, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.stnt1.pn.vg4.nxv8i16( %zn0, %zn1, %zn2, %zn3, target("aarch64.svcount") %pn, ptr %ptr); + ret void +} + +define void @stnt1_vg4_i32( %zn0, %zn1, %zn2, %zn3, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: stnt1_vg4_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: stnt1w { z0.s - z3.s }, pn8, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.stnt1.pn.vg4.nxv4i32( %zn0, %zn1, %zn2, %zn3, target("aarch64.svcount") %pn, ptr %ptr); + ret void +} + +define void @stnt1_vg4_i64( %zn0, %zn1, %zn2, %zn3, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: stnt1_vg4_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: stnt1d { z0.d - z3.d }, pn8, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.stnt1.pn.vg4.nxv2i64( %zn0, %zn1, %zn2, %zn3, target("aarch64.svcount") %pn, ptr %ptr); + ret void +} + +define void @stnt1_vg4_f16( %zn0, %zn1, %zn2, %zn3, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: stnt1_vg4_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: stnt1h { z0.h - z3.h }, pn8, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.stnt1.pn.vg4.nxv8f16( %zn0, %zn1, %zn2, %zn3, target("aarch64.svcount") %pn, ptr %ptr); + ret void +} + +define void @stnt1_vg4_bf16( %zn0, %zn1, %zn2, %zn3, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: stnt1_vg4_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: stnt1h { z0.h - z3.h }, pn8, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.stnt1.pn.vg4.nxv8bf16( %zn0, %zn1, %zn2, %zn3, target("aarch64.svcount") %pn, ptr %ptr); + ret void +} + +define void @stnt1_vg4_f32( %zn0, %zn1, %zn2, %zn3, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: stnt1_vg4_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: stnt1w { z0.s - z3.s }, pn8, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.stnt1.pn.vg4.nxv4f32( %zn0, %zn1, %zn2, %zn3, target("aarch64.svcount") %pn, ptr %ptr); + ret void +} + +define void @stnt1_vg4_f64( %zn0, %zn1, %zn2, %zn3, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: stnt1_vg4_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: stnt1d { z0.d - z3.d }, pn8, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.stnt1.pn.vg4.nxv2f64( %zn0, %zn1, %zn2, %zn3, target("aarch64.svcount") %pn, ptr %ptr); + ret void +} + + +; Test to show we choose the correct registers for the instruction +define void @stnt1_vg4_i32_rotated_regs( %zn0, %zn1, %zn2, %zn3, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: stnt1_vg4_i32_rotated_regs: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z6.d, z3.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov z5.d, z2.d +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z4.d, z1.d +; CHECK-NEXT: mov z7.d, z0.d +; CHECK-NEXT: stnt1w { z4.s - z7.s }, pn8, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.stnt1.pn.vg4.nxv4i32( %zn1, %zn2, %zn3, %zn0, target("aarch64.svcount") %pn, ptr %ptr); + ret void +} + +declare void @llvm.aarch64.sve.st1.pn.vg2.nxv16i8(, , target("aarch64.svcount"), ptr) +declare void @llvm.aarch64.sve.st1.pn.vg2.nxv8i16(, , target("aarch64.svcount"), ptr) +declare void @llvm.aarch64.sve.st1.pn.vg2.nxv4i32(, , target("aarch64.svcount"), ptr) +declare void @llvm.aarch64.sve.st1.pn.vg2.nxv2i64(, , target("aarch64.svcount"), ptr) +declare void @llvm.aarch64.sve.st1.pn.vg2.nxv8f16(, , target("aarch64.svcount"), ptr) +declare void @llvm.aarch64.sve.st1.pn.vg2.nxv8bf16(, , target("aarch64.svcount"), ptr) +declare void @llvm.aarch64.sve.st1.pn.vg2.nxv4f32(, , target("aarch64.svcount"), ptr) +declare void @llvm.aarch64.sve.st1.pn.vg2.nxv2f64(, , target("aarch64.svcount"), ptr) +declare void @llvm.aarch64.sve.stnt1.pn.vg2.nxv16i8(, , target("aarch64.svcount"), ptr) +declare void @llvm.aarch64.sve.stnt1.pn.vg2.nxv8i16(, , target("aarch64.svcount"), ptr) +declare void @llvm.aarch64.sve.stnt1.pn.vg2.nxv4i32(, , target("aarch64.svcount"), ptr) +declare void @llvm.aarch64.sve.stnt1.pn.vg2.nxv2i64(, , target("aarch64.svcount"), ptr) +declare void @llvm.aarch64.sve.stnt1.pn.vg2.nxv8f16(, , target("aarch64.svcount"), ptr) +declare void @llvm.aarch64.sve.stnt1.pn.vg2.nxv8bf16(, , target("aarch64.svcount"), ptr) +declare void @llvm.aarch64.sve.stnt1.pn.vg2.nxv4f32(, , target("aarch64.svcount"), ptr) +declare void @llvm.aarch64.sve.stnt1.pn.vg2.nxv2f64(, , target("aarch64.svcount"), ptr) + + +declare void @llvm.aarch64.sve.st1.pn.vg4.nxv16i8(, , , , target("aarch64.svcount"), ptr) +declare void @llvm.aarch64.sve.st1.pn.vg4.nxv8i16(, , , , target("aarch64.svcount"), ptr) +declare void @llvm.aarch64.sve.st1.pn.vg4.nxv4i32(, , , , target("aarch64.svcount"), ptr) +declare void @llvm.aarch64.sve.st1.pn.vg4.nxv2i64(, , , , target("aarch64.svcount"), ptr) +declare void @llvm.aarch64.sve.st1.pn.vg4.nxv8f16(, , , , target("aarch64.svcount"), ptr) +declare void @llvm.aarch64.sve.st1.pn.vg4.nxv8bf16(, , , , target("aarch64.svcount"), ptr) +declare void @llvm.aarch64.sve.st1.pn.vg4.nxv4f32(, , , , target("aarch64.svcount"), ptr) +declare void @llvm.aarch64.sve.st1.pn.vg4.nxv2f64(, , , , target("aarch64.svcount"), ptr) +declare void @llvm.aarch64.sve.stnt1.pn.vg4.nxv16i8(, , , , target("aarch64.svcount"), ptr) +declare void @llvm.aarch64.sve.stnt1.pn.vg4.nxv8i16(, , , , target("aarch64.svcount"), ptr) +declare void @llvm.aarch64.sve.stnt1.pn.vg4.nxv4i32(, , , , target("aarch64.svcount"), ptr) +declare void @llvm.aarch64.sve.stnt1.pn.vg4.nxv2i64(, , , , target("aarch64.svcount"), ptr) +declare void @llvm.aarch64.sve.stnt1.pn.vg4.nxv8f16(, , , , target("aarch64.svcount"), ptr) +declare void @llvm.aarch64.sve.stnt1.pn.vg4.nxv8bf16(, , , , target("aarch64.svcount"), ptr) +declare void @llvm.aarch64.sve.stnt1.pn.vg4.nxv4f32(, , , , target("aarch64.svcount"), ptr) +declare void @llvm.aarch64.sve.stnt1.pn.vg4.nxv2f64(, , , , target("aarch64.svcount"), ptr) diff --git 
a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-uclamp.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-uclamp.ll --- a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-uclamp.ll +++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-uclamp.ll @@ -6,8 +6,7 @@ define @test_uclamp_i8( %a, %b, %c) #0 { ; CHECK-LABEL: test_uclamp_i8: ; CHECK: // %bb.0: -; CHECK-NEXT: uclamp z2.b, z0.b, z1.b -; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: uclamp z0.b, z1.b, z2.b ; CHECK-NEXT: ret %res = call @llvm.aarch64.sve.uclamp.nxv16i8( %a, %b, %c) ret %res @@ -16,8 +15,7 @@ define @test_uclamp_i16( %a, %b, %c) #0 { ; CHECK-LABEL: test_uclamp_i16: ; CHECK: // %bb.0: -; CHECK-NEXT: uclamp z2.h, z0.h, z1.h -; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: uclamp z0.h, z1.h, z2.h ; CHECK-NEXT: ret %res = call @llvm.aarch64.sve.uclamp.nxv8i16( %a, %b, %c) ret %res @@ -26,8 +24,7 @@ define @test_uclamp_i32( %a, %b, %c) #0 { ; CHECK-LABEL: test_uclamp_i32: ; CHECK: // %bb.0: -; CHECK-NEXT: uclamp z2.s, z0.s, z1.s -; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: uclamp z0.s, z1.s, z2.s ; CHECK-NEXT: ret %res = call @llvm.aarch64.sve.uclamp.nxv4i32( %a, %b, %c) ret %res @@ -36,8 +33,7 @@ define @test_uclamp_i64( %a, %b, %c) #0 { ; CHECK-LABEL: test_uclamp_i64: ; CHECK: // %bb.0: -; CHECK-NEXT: uclamp z2.d, z0.d, z1.d -; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: uclamp z0.d, z1.d, z2.d ; CHECK-NEXT: ret %res = call @llvm.aarch64.sve.uclamp.nxv2i64( %a, %b, %c) ret %res diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-uzpx2.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-uzpx2.ll --- a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-uzpx2.ll +++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-uzpx2.ll @@ -3,40 +3,31 @@ ; == 8 to 64-bit elements == -define @uzp_x2_i8( %zn, %zm) nounwind { +define { , } @uzp_x2_i8( %zn, %zm) nounwind { ; CHECK-LABEL: uzp_x2_i8: ; CHECK: // %bb.0: -; CHECK-NEXT: uzp { z2.b, z3.b }, z0.b, z1.b -; CHECK-NEXT: add z0.b, z2.b, z0.b +; CHECK-NEXT: uzp { z0.b, z1.b }, z0.b, z1.b ; CHECK-NEXT: ret - %uzp = call { , } @llvm.aarch64.sve.uzp.x2.nxv16i8( %zn, %zm) - %uzp0 = extractvalue { , } %uzp, 0 - %add = add %uzp0, %zn - ret %add + %res = call { , } @llvm.aarch64.sve.uzp.x2.nxv16i8( %zn, %zm) + ret { , } %res } -define @uzp_x2_i16( %zn, %zm) nounwind { +define { , } @uzp_x2_i16( %zn, %zm) nounwind { ; CHECK-LABEL: uzp_x2_i16: ; CHECK: // %bb.0: -; CHECK-NEXT: uzp { z2.h, z3.h }, z0.h, z1.h -; CHECK-NEXT: add z0.h, z2.h, z0.h +; CHECK-NEXT: uzp { z0.h, z1.h }, z0.h, z1.h ; CHECK-NEXT: ret - %uzp = call { , } @llvm.aarch64.sve.uzp.x2.nxv8i16( %zn, %zm) - %uzp0 = extractvalue { , } %uzp, 0 - %add = add %uzp0, %zn - ret %add + %res = call { , } @llvm.aarch64.sve.uzp.x2.nxv8i16( %zn, %zm) + ret { , } %res } -define @uzp_x2_f16( %zn, %zm) nounwind { +define { , } @uzp_x2_f16( %zn, %zm) nounwind { ; CHECK-LABEL: uzp_x2_f16: ; CHECK: // %bb.0: -; CHECK-NEXT: uzp { z2.h, z3.h }, z0.h, z1.h -; CHECK-NEXT: fadd z0.h, z2.h, z0.h +; CHECK-NEXT: uzp { z0.h, z1.h }, z0.h, z1.h ; CHECK-NEXT: ret - %uzp = call { , } @llvm.aarch64.sve.uzp.x2.nxv8f16( %zn, %zm) - %uzp0 = extractvalue { , } %uzp, 0 - %add = fadd %uzp0, %zn - ret %add + %res = call { , } @llvm.aarch64.sve.uzp.x2.nxv8f16( %zn, %zm) + ret { , } %res } define { , } @uzp_x2_bf16( %zn, %zm) nounwind { @@ -44,74 +35,56 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: uzp { z0.h, z1.h }, z0.h, z1.h ; CHECK-NEXT: ret - %uzp = call { , } @llvm.aarch64.sve.uzp.x2.nxv8bf16( %zn, %zm) - ret { , } %uzp + %res = call { , } @llvm.aarch64.sve.uzp.x2.nxv8bf16( %zn, %zm) + ret { , } %res } 
-define @uzp_x2_i32( %zn, %zm) nounwind { +define { , } @uzp_x2_i32( %zn, %zm) nounwind { ; CHECK-LABEL: uzp_x2_i32: ; CHECK: // %bb.0: -; CHECK-NEXT: uzp { z2.s, z3.s }, z0.s, z1.s -; CHECK-NEXT: add z0.s, z2.s, z0.s +; CHECK-NEXT: uzp { z0.s, z1.s }, z0.s, z1.s ; CHECK-NEXT: ret - %uzp = call { , } @llvm.aarch64.sve.uzp.x2.nxv4i32( %zn, %zm) - %uzp0 = extractvalue { , } %uzp, 0 - %add = add %uzp0, %zn - ret %add + %res = call { , } @llvm.aarch64.sve.uzp.x2.nxv4i32( %zn, %zm) + ret { , } %res } -define @uzp_x2_f32( %zn, %zm) nounwind { +define { , } @uzp_x2_f32( %zn, %zm) nounwind { ; CHECK-LABEL: uzp_x2_f32: ; CHECK: // %bb.0: -; CHECK-NEXT: uzp { z2.s, z3.s }, z0.s, z1.s -; CHECK-NEXT: fadd z0.s, z2.s, z0.s +; CHECK-NEXT: uzp { z0.s, z1.s }, z0.s, z1.s ; CHECK-NEXT: ret - %uzp = call { , } @llvm.aarch64.sve.uzp.x2.nxv4f32( %zn, %zm) - %uzp0 = extractvalue { , } %uzp, 0 - %add = fadd %uzp0, %zn - ret %add + %res = call { , } @llvm.aarch64.sve.uzp.x2.nxv4f32( %zn, %zm) + ret { , } %res } -define @uzp_x2_i64( %zn, %zm) nounwind { +define { , } @uzp_x2_i64( %zn, %zm) nounwind { ; CHECK-LABEL: uzp_x2_i64: ; CHECK: // %bb.0: -; CHECK-NEXT: uzp { z2.d, z3.d }, z0.d, z1.d -; CHECK-NEXT: add z0.d, z2.d, z0.d +; CHECK-NEXT: uzp { z0.d, z1.d }, z0.d, z1.d ; CHECK-NEXT: ret - %uzp = call { , } @llvm.aarch64.sve.uzp.x2.nxv2i64( %zn, %zm) - %uzp0 = extractvalue { , } %uzp, 0 - %add = add %uzp0, %zn - ret %add + %res = call { , } @llvm.aarch64.sve.uzp.x2.nxv2i64( %zn, %zm) + ret { , } %res } -define @uzp_x2_f64( %zn, %zm) nounwind { +define { , } @uzp_x2_f64( %zn, %zm) nounwind { ; CHECK-LABEL: uzp_x2_f64: ; CHECK: // %bb.0: -; CHECK-NEXT: uzp { z2.d, z3.d }, z0.d, z1.d -; CHECK-NEXT: fadd z0.d, z2.d, z0.d +; CHECK-NEXT: uzp { z0.d, z1.d }, z0.d, z1.d ; CHECK-NEXT: ret - %uzp = call { , } @llvm.aarch64.sve.uzp.x2.nxv2f64( %zn, %zm) - %uzp0 = extractvalue { , } %uzp, 0 - %add = fadd %uzp0, %zn - ret %add + %res = call { , } @llvm.aarch64.sve.uzp.x2.nxv2f64( %zn, %zm) + ret { , } %res } ; == 128-bit elements == -; NOTE: For the 128-bit case we only need to check the to -; ensure the tuple result starts at the correct register multiple. The other -; variants all test the same code path. 
-define @uzpq_x2_i8( %zn, %zm) nounwind { +define { , } @uzpq_x2_i8( %zn, %zm) nounwind { ; CHECK-LABEL: uzpq_x2_i8: ; CHECK: // %bb.0: -; CHECK-NEXT: uzp { z2.q, z3.q }, z0.q, z1.q -; CHECK-NEXT: add z0.b, z2.b, z0.b +; CHECK-NEXT: uzp { z0.q, z1.q }, z0.q, z1.q ; CHECK-NEXT: ret - %uzp = call { , } @llvm.aarch64.sve.uzpq.x2.nxv16i8( %zn, %zm) - %uzp0 = extractvalue { , } %uzp, 0 - %add = add %uzp0, %zn - ret %add + %res = call { , } @llvm.aarch64.sve.uzpq.x2.nxv16i8( %zn, %zm) + ret { , } %res } define { , } @uzpq_x2_i16( %zn, %zm) nounwind { @@ -177,18 +150,9 @@ ret { , } %res } -define { , } @uzpq_x2_i8_not_tied( %unused, %zn, %zm) nounwind { -; CHECK-LABEL: uzpq_x2_i8_not_tied: -; CHECK: // %bb.0: -; CHECK-NEXT: uzp { z0.q, z1.q }, z1.q, z2.q -; CHECK-NEXT: ret - %res = call { , } @llvm.aarch64.sve.uzpq.x2.nxv16i8( %zn, %zm) - ret { , } %res -} - ; == 8 to 64-bit elements == -declare { , } @llvm.aarch64.sve.uzp.x2.nxv16i8( %zn, %zm) +declare { , } @llvm.aarch64.sve.uzp.x2.nxv16i8(, ) declare { , } @llvm.aarch64.sve.uzp.x2.nxv8i16( %zn, %zm) declare { , } @llvm.aarch64.sve.uzp.x2.nxv4i32( %zn, %zm) declare { , } @llvm.aarch64.sve.uzp.x2.nxv2i64( %zn, %zm) diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-uzpx4.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-uzpx4.ll --- a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-uzpx4.ll +++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-uzpx4.ll @@ -3,105 +3,105 @@ ; == 8 to 64-bit elements == -define { , , , } @uzp_x4_i8( %unused, %zn1, %zn2, %zn3, %zn4) nounwind { +define { , , , } @uzp_x4_i8( %zn1, %zn2, %zn3, %zn4) nounwind { ; CHECK-LABEL: uzp_x4_i8: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z7.d, z4.d -; CHECK-NEXT: mov z6.d, z3.d -; CHECK-NEXT: mov z5.d, z2.d -; CHECK-NEXT: mov z4.d, z1.d -; CHECK-NEXT: uzp { z0.b - z3.b }, { z4.b - z7.b } +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: uzp { z0.b - z3.b }, { z0.b - z3.b } ; CHECK-NEXT: ret %res = call { , , , } @llvm.aarch64.sve.uzp.x4.nxv16i8( %zn1, %zn2, %zn3, %zn4) ret { , , , } %res } -define { , , , } @uzp_x4_i16( %unused, %zn1, %zn2, %zn3, %zn4) nounwind { +define { , , , } @uzp_x4_i16( %zn1, %zn2, %zn3, %zn4) nounwind { ; CHECK-LABEL: uzp_x4_i16: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z7.d, z4.d -; CHECK-NEXT: mov z6.d, z3.d -; CHECK-NEXT: mov z5.d, z2.d -; CHECK-NEXT: mov z4.d, z1.d -; CHECK-NEXT: uzp { z0.h - z3.h }, { z4.h - z7.h } +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: uzp { z0.h - z3.h }, { z0.h - z3.h } ; CHECK-NEXT: ret %res = call { , , , } @llvm.aarch64.sve.uzp.x4.nxv8i16( %zn1, %zn2, %zn3, %zn4) ret { , , , } %res } -define { , , , } @uzp_x4_f16( %unused, %zn1, %zn2, %zn3, %zn4) nounwind { +define { , , , } @uzp_x4_f16( %zn1, %zn2, %zn3, %zn4) nounwind { ; CHECK-LABEL: uzp_x4_f16: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z7.d, z4.d -; CHECK-NEXT: mov z6.d, z3.d -; CHECK-NEXT: mov z5.d, z2.d -; CHECK-NEXT: mov z4.d, z1.d -; CHECK-NEXT: uzp { z0.h - z3.h }, { z4.h - z7.h } +; CHECK-NEXT: // kill: def $z3 
killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: uzp { z0.h - z3.h }, { z0.h - z3.h } ; CHECK-NEXT: ret %res = call { , , , } @llvm.aarch64.sve.uzp.x4.nxv8f16( %zn1, %zn2, %zn3, %zn4) ret { , , , } %res } -define { , , , } @uzp_x4_bf16( %unused, %zn1, %zn2, %zn3, %zn4) nounwind { +define { , , , } @uzp_x4_bf16( %zn1, %zn2, %zn3, %zn4) nounwind { ; CHECK-LABEL: uzp_x4_bf16: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z7.d, z4.d -; CHECK-NEXT: mov z6.d, z3.d -; CHECK-NEXT: mov z5.d, z2.d -; CHECK-NEXT: mov z4.d, z1.d -; CHECK-NEXT: uzp { z0.h - z3.h }, { z4.h - z7.h } +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: uzp { z0.h - z3.h }, { z0.h - z3.h } ; CHECK-NEXT: ret %res = call { , , , } @llvm.aarch64.sve.uzp.x4.nxv8bf16( %zn1, %zn2, %zn3, %zn4) ret { , , , } %res } -define { , , , } @uzp_x4_i32( %unused, %zn1, %zn2, %zn3, %zn4) nounwind { +define { , , , } @uzp_x4_i32( %zn1, %zn2, %zn3, %zn4) nounwind { ; CHECK-LABEL: uzp_x4_i32: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z7.d, z4.d -; CHECK-NEXT: mov z6.d, z3.d -; CHECK-NEXT: mov z5.d, z2.d -; CHECK-NEXT: mov z4.d, z1.d -; CHECK-NEXT: uzp { z0.s - z3.s }, { z4.s - z7.s } +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: uzp { z0.s - z3.s }, { z0.s - z3.s } ; CHECK-NEXT: ret %res = call { , , , } @llvm.aarch64.sve.uzp.x4.nxv4i32( %zn1, %zn2, %zn3, %zn4) ret { , , , } %res } -define { , , , } @uzp_x4_f32( %unused, %zn1, %zn2, %zn3, %zn4) nounwind { +define { , , , } @uzp_x4_f32( %zn1, %zn2, %zn3, %zn4) nounwind { ; CHECK-LABEL: uzp_x4_f32: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z7.d, z4.d -; CHECK-NEXT: mov z6.d, z3.d -; CHECK-NEXT: mov z5.d, z2.d -; CHECK-NEXT: mov z4.d, z1.d -; CHECK-NEXT: uzp { z0.s - z3.s }, { z4.s - z7.s } +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: uzp { z0.s - z3.s }, { z0.s - z3.s } ; CHECK-NEXT: ret %res = call { , , , } @llvm.aarch64.sve.uzp.x4.nxv4f32( %zn1, %zn2, %zn3, %zn4) ret { , , , } %res } -define { , , , } @uzp_x4_i64( %unused, %zn1, %zn2, %zn3, %zn4) nounwind { +define { , , , } @uzp_x4_i64( %zn1, %zn2, %zn3, %zn4) nounwind { ; CHECK-LABEL: uzp_x4_i64: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z7.d, z4.d -; CHECK-NEXT: mov z6.d, z3.d -; CHECK-NEXT: mov z5.d, z2.d -; CHECK-NEXT: mov z4.d, z1.d -; CHECK-NEXT: uzp { z0.d - z3.d }, { z4.d - z7.d } +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // 
kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: uzp { z0.d - z3.d }, { z0.d - z3.d } ; CHECK-NEXT: ret %res = call { , , , } @llvm.aarch64.sve.uzp.x4.nxv2i64( %zn1, %zn2, %zn3, %zn4) ret { , , , } %res } -define { , , , } @uzp_x4_f64( %unused, %zn1, %zn2, %zn3, %zn4) nounwind { +define { , , , } @uzp_x4_f64( %zn1, %zn2, %zn3, %zn4) nounwind { ; CHECK-LABEL: uzp_x4_f64: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z27.d, z5.d -; CHECK-NEXT: mov z26.d, z4.d -; CHECK-NEXT: mov z25.d, z3.d -; CHECK-NEXT: mov z24.d, z2.d -; CHECK-NEXT: uzp { z0.d - z3.d }, { z24.d - z27.d } +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: uzp { z0.d - z3.d }, { z0.d - z3.d } ; CHECK-NEXT: ret %res = call { , , , } @llvm.aarch64.sve.uzp.x4.nxv2f64( %zn1, %zn2, %zn3, %zn4) ret { , , , } %res @@ -110,105 +110,105 @@ ; == 128-bit elements == -define { , , , } @zipq_x4_i8( %unused, %zn1, %zn2, %zn3, %zn4) nounwind { +define { , , , } @zipq_x4_i8( %zn1, %zn2, %zn3, %zn4) nounwind { ; CHECK-LABEL: zipq_x4_i8: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z7.d, z4.d -; CHECK-NEXT: mov z6.d, z3.d -; CHECK-NEXT: mov z5.d, z2.d -; CHECK-NEXT: mov z4.d, z1.d -; CHECK-NEXT: uzp { z0.q - z3.q }, { z4.q - z7.q } +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: uzp { z0.q - z3.q }, { z0.q - z3.q } ; CHECK-NEXT: ret %res = call { , , , } @llvm.aarch64.sve.uzpq.x4.nxv16i8( %zn1, %zn2, %zn3, %zn4) ret { , , , } %res } -define { , , , } @zipq_x4_i16( %unused, %zn1, %zn2, %zn3, %zn4) nounwind { +define { , , , } @zipq_x4_i16( %zn1, %zn2, %zn3, %zn4) nounwind { ; CHECK-LABEL: zipq_x4_i16: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z7.d, z4.d -; CHECK-NEXT: mov z6.d, z3.d -; CHECK-NEXT: mov z5.d, z2.d -; CHECK-NEXT: mov z4.d, z1.d -; CHECK-NEXT: uzp { z0.q - z3.q }, { z4.q - z7.q } +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: uzp { z0.q - z3.q }, { z0.q - z3.q } ; CHECK-NEXT: ret %res = call { , , , } @llvm.aarch64.sve.uzpq.x4.nxv8i16( %zn1, %zn2, %zn3, %zn4) ret { , , , } %res } -define { , , , } @zipq_x4_f16( %unused, %zn1, %zn2, %zn3, %zn4) nounwind { +define { , , , } @zipq_x4_f16( %zn1, %zn2, %zn3, %zn4) nounwind { ; CHECK-LABEL: zipq_x4_f16: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z7.d, z4.d -; CHECK-NEXT: mov z6.d, z3.d -; CHECK-NEXT: mov z5.d, z2.d -; CHECK-NEXT: mov z4.d, z1.d -; CHECK-NEXT: uzp { z0.q - z3.q }, { z4.q - z7.q } +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 
killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: uzp { z0.q - z3.q }, { z0.q - z3.q } ; CHECK-NEXT: ret %res = call { , , , } @llvm.aarch64.sve.uzpq.x4.nxv8f16( %zn1, %zn2, %zn3, %zn4) ret { , , , } %res } -define { , , , } @zipq_x4_bf16( %unused, %zn1, %zn2, %zn3, %zn4) nounwind { +define { , , , } @zipq_x4_bf16( %zn1, %zn2, %zn3, %zn4) nounwind { ; CHECK-LABEL: zipq_x4_bf16: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z7.d, z4.d -; CHECK-NEXT: mov z6.d, z3.d -; CHECK-NEXT: mov z5.d, z2.d -; CHECK-NEXT: mov z4.d, z1.d -; CHECK-NEXT: uzp { z0.q - z3.q }, { z4.q - z7.q } +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: uzp { z0.q - z3.q }, { z0.q - z3.q } ; CHECK-NEXT: ret %res = call { , , , } @llvm.aarch64.sve.uzpq.x4.nxv8bf16( %zn1, %zn2, %zn3, %zn4) ret { , , , } %res } -define { , , , } @zipq_x4_i32( %unused, %zn1, %zn2, %zn3, %zn4) nounwind { +define { , , , } @zipq_x4_i32( %zn1, %zn2, %zn3, %zn4) nounwind { ; CHECK-LABEL: zipq_x4_i32: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z7.d, z4.d -; CHECK-NEXT: mov z6.d, z3.d -; CHECK-NEXT: mov z5.d, z2.d -; CHECK-NEXT: mov z4.d, z1.d -; CHECK-NEXT: uzp { z0.q - z3.q }, { z4.q - z7.q } +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: uzp { z0.q - z3.q }, { z0.q - z3.q } ; CHECK-NEXT: ret %res = call { , , , } @llvm.aarch64.sve.uzpq.x4.nxv4i32( %zn1, %zn2, %zn3, %zn4) ret { , , , } %res } -define { , , , } @zipq_x4_f32( %unused, %zn1, %zn2, %zn3, %zn4) nounwind { +define { , , , } @zipq_x4_f32( %zn1, %zn2, %zn3, %zn4) nounwind { ; CHECK-LABEL: zipq_x4_f32: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z7.d, z4.d -; CHECK-NEXT: mov z6.d, z3.d -; CHECK-NEXT: mov z5.d, z2.d -; CHECK-NEXT: mov z4.d, z1.d -; CHECK-NEXT: uzp { z0.q - z3.q }, { z4.q - z7.q } +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: uzp { z0.q - z3.q }, { z0.q - z3.q } ; CHECK-NEXT: ret %res = call { , , , } @llvm.aarch64.sve.uzpq.x4.nxv4f32( %zn1, %zn2, %zn3, %zn4) ret { , , , } %res } -define { , , , } @zipq_x4_i64( %unused, %zn1, %zn2, %zn3, %zn4) nounwind { +define { , , , } @zipq_x4_i64( %zn1, %zn2, %zn3, %zn4) nounwind { ; CHECK-LABEL: zipq_x4_i64: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z7.d, z4.d -; CHECK-NEXT: mov z6.d, z3.d -; CHECK-NEXT: mov z5.d, z2.d -; CHECK-NEXT: mov z4.d, z1.d -; CHECK-NEXT: uzp { z0.q - z3.q }, { z4.q - z7.q } +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: uzp { z0.q - z3.q }, { z0.q - z3.q } ; CHECK-NEXT: ret %res = call { , 
, , } @llvm.aarch64.sve.uzpq.x4.nxv2i64( %zn1, %zn2, %zn3, %zn4) ret { , , , } %res } -define { , , , } @zipq_x4_f64( %unused, %zn1, %zn2, %zn3, %zn4) nounwind { +define { , , , } @zipq_x4_f64( %zn1, %zn2, %zn3, %zn4) nounwind { ; CHECK-LABEL: zipq_x4_f64: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z27.d, z5.d -; CHECK-NEXT: mov z26.d, z4.d -; CHECK-NEXT: mov z25.d, z3.d -; CHECK-NEXT: mov z24.d, z2.d -; CHECK-NEXT: uzp { z0.q - z3.q }, { z24.q - z27.q } +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: uzp { z0.q - z3.q }, { z0.q - z3.q } ; CHECK-NEXT: ret %res = call { , , , } @llvm.aarch64.sve.uzpq.x4.nxv2f64( %zn1, %zn2, %zn3, %zn4) ret { , , , } %res diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-while-pn.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-while-pn.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-while-pn.ll @@ -0,0 +1,717 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 < %s | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 < %s | FileCheck %s + + +; +; WHILEGE +; + +define target("aarch64.svcount") @whilege_c8_vl2(i64 %a, i64 %b) nounwind { +; CHECK-LABEL: whilege_c8_vl2: +; CHECK: // %bb.0: +; CHECK-NEXT: whilege pn8.b, x0, x1, vlx2 +; CHECK-NEXT: mov p0.b, p8.b +; CHECK-NEXT: ret + %out = call target("aarch64.svcount") @llvm.aarch64.sve.whilege.c8(i64 %a, i64 %b, i32 2) + ret target("aarch64.svcount") %out +} + +define target("aarch64.svcount") @whilege_c8_vl4(i64 %a, i64 %b) nounwind { +; CHECK-LABEL: whilege_c8_vl4: +; CHECK: // %bb.0: +; CHECK-NEXT: whilege pn8.b, x0, x1, vlx4 +; CHECK-NEXT: mov p0.b, p8.b +; CHECK-NEXT: ret + %out = call target("aarch64.svcount") @llvm.aarch64.sve.whilege.c8(i64 %a, i64 %b, i32 4) + ret target("aarch64.svcount") %out +} + +define target("aarch64.svcount") @whilege_c16_vl2(i64 %a, i64 %b) nounwind { +; CHECK-LABEL: whilege_c16_vl2: +; CHECK: // %bb.0: +; CHECK-NEXT: whilege pn8.h, x0, x1, vlx2 +; CHECK-NEXT: mov p0.b, p8.b +; CHECK-NEXT: ret + %out = call target("aarch64.svcount") @llvm.aarch64.sve.whilege.c16(i64 %a, i64 %b, i32 2) + ret target("aarch64.svcount") %out +} + +define target("aarch64.svcount") @whilege_c16_vl4(i64 %a, i64 %b) nounwind { +; CHECK-LABEL: whilege_c16_vl4: +; CHECK: // %bb.0: +; CHECK-NEXT: whilege pn8.h, x0, x1, vlx4 +; CHECK-NEXT: mov p0.b, p8.b +; CHECK-NEXT: ret + %out = call target("aarch64.svcount") @llvm.aarch64.sve.whilege.c16(i64 %a, i64 %b, i32 4) + ret target("aarch64.svcount") %out +} + +define target("aarch64.svcount") @whilege_c32_vl2(i64 %a, i64 %b) nounwind { +; CHECK-LABEL: whilege_c32_vl2: +; CHECK: // %bb.0: +; CHECK-NEXT: whilege pn8.s, x0, x1, vlx2 +; CHECK-NEXT: mov p0.b, p8.b +; CHECK-NEXT: ret + %out = call target("aarch64.svcount") @llvm.aarch64.sve.whilege.c32(i64 %a, i64 %b, i32 2) + ret target("aarch64.svcount") %out +} + +define target("aarch64.svcount") @whilege_c32_vl4(i64 %a, i64 %b) nounwind { +; CHECK-LABEL: whilege_c32_vl4: +; CHECK: // %bb.0: +; CHECK-NEXT: whilege pn8.s, x0, x1, vlx4 +; CHECK-NEXT: mov p0.b, p8.b +; CHECK-NEXT: ret + %out = call target("aarch64.svcount") @llvm.aarch64.sve.whilege.c32(i64 %a, i64 %b, i32 4) + ret 
target("aarch64.svcount") %out +} + +define target("aarch64.svcount") @whilege_c64_vl2(i64 %a, i64 %b) nounwind { +; CHECK-LABEL: whilege_c64_vl2: +; CHECK: // %bb.0: +; CHECK-NEXT: whilege pn8.d, x0, x1, vlx2 +; CHECK-NEXT: mov p0.b, p8.b +; CHECK-NEXT: ret + %out = call target("aarch64.svcount") @llvm.aarch64.sve.whilege.c64(i64 %a, i64 %b, i32 2) + ret target("aarch64.svcount") %out +} + +define target("aarch64.svcount") @whilege_c64_vl4(i64 %a, i64 %b) nounwind { +; CHECK-LABEL: whilege_c64_vl4: +; CHECK: // %bb.0: +; CHECK-NEXT: whilege pn8.d, x0, x1, vlx4 +; CHECK-NEXT: mov p0.b, p8.b +; CHECK-NEXT: ret + %out = call target("aarch64.svcount") @llvm.aarch64.sve.whilege.c64(i64 %a, i64 %b, i32 4) + ret target("aarch64.svcount") %out +} + + +; +; WHILEGT +; + +define target("aarch64.svcount") @whilegt_c8_vl2(i64 %a, i64 %b) nounwind { +; CHECK-LABEL: whilegt_c8_vl2: +; CHECK: // %bb.0: +; CHECK-NEXT: whilegt pn8.b, x0, x1, vlx2 +; CHECK-NEXT: mov p0.b, p8.b +; CHECK-NEXT: ret + %out = call target("aarch64.svcount") @llvm.aarch64.sve.whilegt.c8(i64 %a, i64 %b, i32 2) + ret target("aarch64.svcount") %out +} + +define target("aarch64.svcount") @whilegt_c8_vl4(i64 %a, i64 %b) nounwind { +; CHECK-LABEL: whilegt_c8_vl4: +; CHECK: // %bb.0: +; CHECK-NEXT: whilegt pn8.b, x0, x1, vlx4 +; CHECK-NEXT: mov p0.b, p8.b +; CHECK-NEXT: ret + %out = call target("aarch64.svcount") @llvm.aarch64.sve.whilegt.c8(i64 %a, i64 %b, i32 4) + ret target("aarch64.svcount") %out +} + +define target("aarch64.svcount") @whilegt_c16_vl2(i64 %a, i64 %b) nounwind { +; CHECK-LABEL: whilegt_c16_vl2: +; CHECK: // %bb.0: +; CHECK-NEXT: whilegt pn8.h, x0, x1, vlx2 +; CHECK-NEXT: mov p0.b, p8.b +; CHECK-NEXT: ret + %out = call target("aarch64.svcount") @llvm.aarch64.sve.whilegt.c16(i64 %a, i64 %b, i32 2) + ret target("aarch64.svcount") %out +} + +define target("aarch64.svcount") @whilegt_c16_vl4(i64 %a, i64 %b) nounwind { +; CHECK-LABEL: whilegt_c16_vl4: +; CHECK: // %bb.0: +; CHECK-NEXT: whilegt pn8.h, x0, x1, vlx4 +; CHECK-NEXT: mov p0.b, p8.b +; CHECK-NEXT: ret + %out = call target("aarch64.svcount") @llvm.aarch64.sve.whilegt.c16(i64 %a, i64 %b, i32 4) + ret target("aarch64.svcount") %out +} + +define target("aarch64.svcount") @whilegt_c32_vl2(i64 %a, i64 %b) nounwind { +; CHECK-LABEL: whilegt_c32_vl2: +; CHECK: // %bb.0: +; CHECK-NEXT: whilegt pn8.s, x0, x1, vlx2 +; CHECK-NEXT: mov p0.b, p8.b +; CHECK-NEXT: ret + %out = call target("aarch64.svcount") @llvm.aarch64.sve.whilegt.c32(i64 %a, i64 %b, i32 2) + ret target("aarch64.svcount") %out +} + +define target("aarch64.svcount") @whilegt_c32_vl4(i64 %a, i64 %b) nounwind { +; CHECK-LABEL: whilegt_c32_vl4: +; CHECK: // %bb.0: +; CHECK-NEXT: whilegt pn8.s, x0, x1, vlx4 +; CHECK-NEXT: mov p0.b, p8.b +; CHECK-NEXT: ret + %out = call target("aarch64.svcount") @llvm.aarch64.sve.whilegt.c32(i64 %a, i64 %b, i32 4) + ret target("aarch64.svcount") %out +} + +define target("aarch64.svcount") @whilegt_c64_vl2(i64 %a, i64 %b) nounwind { +; CHECK-LABEL: whilegt_c64_vl2: +; CHECK: // %bb.0: +; CHECK-NEXT: whilegt pn8.d, x0, x1, vlx2 +; CHECK-NEXT: mov p0.b, p8.b +; CHECK-NEXT: ret + %out = call target("aarch64.svcount") @llvm.aarch64.sve.whilegt.c64(i64 %a, i64 %b, i32 2) + ret target("aarch64.svcount") %out +} + +define target("aarch64.svcount") @whilegt_c64_vl4(i64 %a, i64 %b) nounwind { +; CHECK-LABEL: whilegt_c64_vl4: +; CHECK: // %bb.0: +; CHECK-NEXT: whilegt pn8.d, x0, x1, vlx4 +; CHECK-NEXT: mov p0.b, p8.b +; CHECK-NEXT: ret + %out = call target("aarch64.svcount") 
@llvm.aarch64.sve.whilegt.c64(i64 %a, i64 %b, i32 4) + ret target("aarch64.svcount") %out +} + + +; +; WHILEHI +; + +define target("aarch64.svcount") @whilehi_c8_vl2(i64 %a, i64 %b) nounwind { +; CHECK-LABEL: whilehi_c8_vl2: +; CHECK: // %bb.0: +; CHECK-NEXT: whilehi pn8.b, x0, x1, vlx2 +; CHECK-NEXT: mov p0.b, p8.b +; CHECK-NEXT: ret + %out = call target("aarch64.svcount") @llvm.aarch64.sve.whilehi.c8(i64 %a, i64 %b, i32 2) + ret target("aarch64.svcount") %out +} + +define target("aarch64.svcount") @whilehi_c8_vl4(i64 %a, i64 %b) nounwind { +; CHECK-LABEL: whilehi_c8_vl4: +; CHECK: // %bb.0: +; CHECK-NEXT: whilehi pn8.b, x0, x1, vlx4 +; CHECK-NEXT: mov p0.b, p8.b +; CHECK-NEXT: ret + %out = call target("aarch64.svcount") @llvm.aarch64.sve.whilehi.c8(i64 %a, i64 %b, i32 4) + ret target("aarch64.svcount") %out +} + +define target("aarch64.svcount") @whilehi_c16_vl2(i64 %a, i64 %b) nounwind { +; CHECK-LABEL: whilehi_c16_vl2: +; CHECK: // %bb.0: +; CHECK-NEXT: whilehi pn8.h, x0, x1, vlx2 +; CHECK-NEXT: mov p0.b, p8.b +; CHECK-NEXT: ret + %out = call target("aarch64.svcount") @llvm.aarch64.sve.whilehi.c16(i64 %a, i64 %b, i32 2) + ret target("aarch64.svcount") %out +} + +define target("aarch64.svcount") @whilehi_c16_vl4(i64 %a, i64 %b) nounwind { +; CHECK-LABEL: whilehi_c16_vl4: +; CHECK: // %bb.0: +; CHECK-NEXT: whilehi pn8.h, x0, x1, vlx4 +; CHECK-NEXT: mov p0.b, p8.b +; CHECK-NEXT: ret + %out = call target("aarch64.svcount") @llvm.aarch64.sve.whilehi.c16(i64 %a, i64 %b, i32 4) + ret target("aarch64.svcount") %out +} + +define target("aarch64.svcount") @whilehi_c32_vl2(i64 %a, i64 %b) nounwind { +; CHECK-LABEL: whilehi_c32_vl2: +; CHECK: // %bb.0: +; CHECK-NEXT: whilehi pn8.s, x0, x1, vlx2 +; CHECK-NEXT: mov p0.b, p8.b +; CHECK-NEXT: ret + %out = call target("aarch64.svcount") @llvm.aarch64.sve.whilehi.c32(i64 %a, i64 %b, i32 2) + ret target("aarch64.svcount") %out +} + +define target("aarch64.svcount") @whilehi_c32_vl4(i64 %a, i64 %b) nounwind { +; CHECK-LABEL: whilehi_c32_vl4: +; CHECK: // %bb.0: +; CHECK-NEXT: whilehi pn8.s, x0, x1, vlx4 +; CHECK-NEXT: mov p0.b, p8.b +; CHECK-NEXT: ret + %out = call target("aarch64.svcount") @llvm.aarch64.sve.whilehi.c32(i64 %a, i64 %b, i32 4) + ret target("aarch64.svcount") %out +} + +define target("aarch64.svcount") @whilehi_c64_vl2(i64 %a, i64 %b) nounwind { +; CHECK-LABEL: whilehi_c64_vl2: +; CHECK: // %bb.0: +; CHECK-NEXT: whilehi pn8.d, x0, x1, vlx2 +; CHECK-NEXT: mov p0.b, p8.b +; CHECK-NEXT: ret + %out = call target("aarch64.svcount") @llvm.aarch64.sve.whilehi.c64(i64 %a, i64 %b, i32 2) + ret target("aarch64.svcount") %out +} + +define target("aarch64.svcount") @whilehi_c64_vl4(i64 %a, i64 %b) nounwind { +; CHECK-LABEL: whilehi_c64_vl4: +; CHECK: // %bb.0: +; CHECK-NEXT: whilehi pn8.d, x0, x1, vlx4 +; CHECK-NEXT: mov p0.b, p8.b +; CHECK-NEXT: ret + %out = call target("aarch64.svcount") @llvm.aarch64.sve.whilehi.c64(i64 %a, i64 %b, i32 4) + ret target("aarch64.svcount") %out +} + + +; +; WHILEHS +; + +define target("aarch64.svcount") @whilehs_c8_vl2(i64 %a, i64 %b) nounwind { +; CHECK-LABEL: whilehs_c8_vl2: +; CHECK: // %bb.0: +; CHECK-NEXT: whilehs pn8.b, x0, x1, vlx2 +; CHECK-NEXT: mov p0.b, p8.b +; CHECK-NEXT: ret + %out = call target("aarch64.svcount") @llvm.aarch64.sve.whilehs.c8(i64 %a, i64 %b, i32 2) + ret target("aarch64.svcount") %out +} + +define target("aarch64.svcount") @whilehs_c8_vl4(i64 %a, i64 %b) nounwind { +; CHECK-LABEL: whilehs_c8_vl4: +; CHECK: // %bb.0: +; CHECK-NEXT: whilehs pn8.b, x0, x1, vlx4 +; CHECK-NEXT: mov p0.b, p8.b +; 
CHECK-NEXT: ret + %out = call target("aarch64.svcount") @llvm.aarch64.sve.whilehs.c8(i64 %a, i64 %b, i32 4) + ret target("aarch64.svcount") %out +} + +define target("aarch64.svcount") @whilehs_c16_vl2(i64 %a, i64 %b) nounwind { +; CHECK-LABEL: whilehs_c16_vl2: +; CHECK: // %bb.0: +; CHECK-NEXT: whilehs pn8.h, x0, x1, vlx2 +; CHECK-NEXT: mov p0.b, p8.b +; CHECK-NEXT: ret + %out = call target("aarch64.svcount") @llvm.aarch64.sve.whilehs.c16(i64 %a, i64 %b, i32 2) + ret target("aarch64.svcount") %out +} + +define target("aarch64.svcount") @whilehs_c16_vl4(i64 %a, i64 %b) nounwind { +; CHECK-LABEL: whilehs_c16_vl4: +; CHECK: // %bb.0: +; CHECK-NEXT: whilehs pn8.h, x0, x1, vlx4 +; CHECK-NEXT: mov p0.b, p8.b +; CHECK-NEXT: ret + %out = call target("aarch64.svcount") @llvm.aarch64.sve.whilehs.c16(i64 %a, i64 %b, i32 4) + ret target("aarch64.svcount") %out +} + +define target("aarch64.svcount") @whilehs_c32_vl2(i64 %a, i64 %b) nounwind { +; CHECK-LABEL: whilehs_c32_vl2: +; CHECK: // %bb.0: +; CHECK-NEXT: whilehs pn8.s, x0, x1, vlx2 +; CHECK-NEXT: mov p0.b, p8.b +; CHECK-NEXT: ret + %out = call target("aarch64.svcount") @llvm.aarch64.sve.whilehs.c32(i64 %a, i64 %b, i32 2) + ret target("aarch64.svcount") %out +} + +define target("aarch64.svcount") @whilehs_c32_vl4(i64 %a, i64 %b) nounwind { +; CHECK-LABEL: whilehs_c32_vl4: +; CHECK: // %bb.0: +; CHECK-NEXT: whilehs pn8.s, x0, x1, vlx4 +; CHECK-NEXT: mov p0.b, p8.b +; CHECK-NEXT: ret + %out = call target("aarch64.svcount") @llvm.aarch64.sve.whilehs.c32(i64 %a, i64 %b, i32 4) + ret target("aarch64.svcount") %out +} + +define target("aarch64.svcount") @whilehs_c64_vl2(i64 %a, i64 %b) nounwind { +; CHECK-LABEL: whilehs_c64_vl2: +; CHECK: // %bb.0: +; CHECK-NEXT: whilehs pn8.d, x0, x1, vlx2 +; CHECK-NEXT: mov p0.b, p8.b +; CHECK-NEXT: ret + %out = call target("aarch64.svcount") @llvm.aarch64.sve.whilehs.c64(i64 %a, i64 %b, i32 2) + ret target("aarch64.svcount") %out +} + +define target("aarch64.svcount") @whilehs_c64_vl4(i64 %a, i64 %b) nounwind { +; CHECK-LABEL: whilehs_c64_vl4: +; CHECK: // %bb.0: +; CHECK-NEXT: whilehs pn8.d, x0, x1, vlx4 +; CHECK-NEXT: mov p0.b, p8.b +; CHECK-NEXT: ret + %out = call target("aarch64.svcount") @llvm.aarch64.sve.whilehs.c64(i64 %a, i64 %b, i32 4) + ret target("aarch64.svcount") %out +} + + +; +; WHILELE +; + +define target("aarch64.svcount") @whilele_c8_vl2(i64 %a, i64 %b) nounwind { +; CHECK-LABEL: whilele_c8_vl2: +; CHECK: // %bb.0: +; CHECK-NEXT: whilele pn8.b, x0, x1, vlx2 +; CHECK-NEXT: mov p0.b, p8.b +; CHECK-NEXT: ret + %out = call target("aarch64.svcount") @llvm.aarch64.sve.whilele.c8(i64 %a, i64 %b, i32 2) + ret target("aarch64.svcount") %out +} + +define target("aarch64.svcount") @whilele_c8_vl4(i64 %a, i64 %b) nounwind { +; CHECK-LABEL: whilele_c8_vl4: +; CHECK: // %bb.0: +; CHECK-NEXT: whilele pn8.b, x0, x1, vlx4 +; CHECK-NEXT: mov p0.b, p8.b +; CHECK-NEXT: ret + %out = call target("aarch64.svcount") @llvm.aarch64.sve.whilele.c8(i64 %a, i64 %b, i32 4) + ret target("aarch64.svcount") %out +} + +define target("aarch64.svcount") @whilele_c16_vl2(i64 %a, i64 %b) nounwind { +; CHECK-LABEL: whilele_c16_vl2: +; CHECK: // %bb.0: +; CHECK-NEXT: whilele pn8.h, x0, x1, vlx2 +; CHECK-NEXT: mov p0.b, p8.b +; CHECK-NEXT: ret + %out = call target("aarch64.svcount") @llvm.aarch64.sve.whilele.c16(i64 %a, i64 %b, i32 2) + ret target("aarch64.svcount") %out +} + +define target("aarch64.svcount") @whilele_c16_vl4(i64 %a, i64 %b) nounwind { +; CHECK-LABEL: whilele_c16_vl4: +; CHECK: // %bb.0: +; CHECK-NEXT: whilele pn8.h, x0, 
x1, vlx4 +; CHECK-NEXT: mov p0.b, p8.b +; CHECK-NEXT: ret + %out = call target("aarch64.svcount") @llvm.aarch64.sve.whilele.c16(i64 %a, i64 %b, i32 4) + ret target("aarch64.svcount") %out +} + +define target("aarch64.svcount") @whilele_c32_vl2(i64 %a, i64 %b) nounwind { +; CHECK-LABEL: whilele_c32_vl2: +; CHECK: // %bb.0: +; CHECK-NEXT: whilele pn8.s, x0, x1, vlx2 +; CHECK-NEXT: mov p0.b, p8.b +; CHECK-NEXT: ret + %out = call target("aarch64.svcount") @llvm.aarch64.sve.whilele.c32(i64 %a, i64 %b, i32 2) + ret target("aarch64.svcount") %out +} + +define target("aarch64.svcount") @whilele_c32_vl4(i64 %a, i64 %b) nounwind { +; CHECK-LABEL: whilele_c32_vl4: +; CHECK: // %bb.0: +; CHECK-NEXT: whilele pn8.s, x0, x1, vlx4 +; CHECK-NEXT: mov p0.b, p8.b +; CHECK-NEXT: ret + %out = call target("aarch64.svcount") @llvm.aarch64.sve.whilele.c32(i64 %a, i64 %b, i32 4) + ret target("aarch64.svcount") %out +} + +define target("aarch64.svcount") @whilele_c64_vl2(i64 %a, i64 %b) nounwind { +; CHECK-LABEL: whilele_c64_vl2: +; CHECK: // %bb.0: +; CHECK-NEXT: whilele pn8.d, x0, x1, vlx2 +; CHECK-NEXT: mov p0.b, p8.b +; CHECK-NEXT: ret + %out = call target("aarch64.svcount") @llvm.aarch64.sve.whilele.c64(i64 %a, i64 %b, i32 2) + ret target("aarch64.svcount") %out +} + +define target("aarch64.svcount") @whilele_c64_vl4(i64 %a, i64 %b) nounwind { +; CHECK-LABEL: whilele_c64_vl4: +; CHECK: // %bb.0: +; CHECK-NEXT: whilele pn8.d, x0, x1, vlx4 +; CHECK-NEXT: mov p0.b, p8.b +; CHECK-NEXT: ret + %out = call target("aarch64.svcount") @llvm.aarch64.sve.whilele.c64(i64 %a, i64 %b, i32 4) + ret target("aarch64.svcount") %out +} + + +; +; WHILELO +; + +define target("aarch64.svcount") @whilelo_c8_vl2(i64 %a, i64 %b) nounwind { +; CHECK-LABEL: whilelo_c8_vl2: +; CHECK: // %bb.0: +; CHECK-NEXT: whilelo pn8.b, x0, x1, vlx2 +; CHECK-NEXT: mov p0.b, p8.b +; CHECK-NEXT: ret + %out = call target("aarch64.svcount") @llvm.aarch64.sve.whilelo.c8(i64 %a, i64 %b, i32 2) + ret target("aarch64.svcount") %out +} + +define target("aarch64.svcount") @whilelo_c8_vl4(i64 %a, i64 %b) nounwind { +; CHECK-LABEL: whilelo_c8_vl4: +; CHECK: // %bb.0: +; CHECK-NEXT: whilelo pn8.b, x0, x1, vlx4 +; CHECK-NEXT: mov p0.b, p8.b +; CHECK-NEXT: ret + %out = call target("aarch64.svcount") @llvm.aarch64.sve.whilelo.c8(i64 %a, i64 %b, i32 4) + ret target("aarch64.svcount") %out +} + +define target("aarch64.svcount") @whilelo_c16_vl2(i64 %a, i64 %b) nounwind { +; CHECK-LABEL: whilelo_c16_vl2: +; CHECK: // %bb.0: +; CHECK-NEXT: whilelo pn8.h, x0, x1, vlx2 +; CHECK-NEXT: mov p0.b, p8.b +; CHECK-NEXT: ret + %out = call target("aarch64.svcount") @llvm.aarch64.sve.whilelo.c16(i64 %a, i64 %b, i32 2) + ret target("aarch64.svcount") %out +} + +define target("aarch64.svcount") @whilelo_c16_vl4(i64 %a, i64 %b) nounwind { +; CHECK-LABEL: whilelo_c16_vl4: +; CHECK: // %bb.0: +; CHECK-NEXT: whilelo pn8.h, x0, x1, vlx4 +; CHECK-NEXT: mov p0.b, p8.b +; CHECK-NEXT: ret + %out = call target("aarch64.svcount") @llvm.aarch64.sve.whilelo.c16(i64 %a, i64 %b, i32 4) + ret target("aarch64.svcount") %out +} + +define target("aarch64.svcount") @whilelo_c32_vl2(i64 %a, i64 %b) nounwind { +; CHECK-LABEL: whilelo_c32_vl2: +; CHECK: // %bb.0: +; CHECK-NEXT: whilelo pn8.s, x0, x1, vlx2 +; CHECK-NEXT: mov p0.b, p8.b +; CHECK-NEXT: ret + %out = call target("aarch64.svcount") @llvm.aarch64.sve.whilelo.c32(i64 %a, i64 %b, i32 2) + ret target("aarch64.svcount") %out +} + +define target("aarch64.svcount") @whilelo_c32_vl4(i64 %a, i64 %b) nounwind { +; CHECK-LABEL: whilelo_c32_vl4: +; CHECK: // 
%bb.0: +; CHECK-NEXT: whilelo pn8.s, x0, x1, vlx4 +; CHECK-NEXT: mov p0.b, p8.b +; CHECK-NEXT: ret + %out = call target("aarch64.svcount") @llvm.aarch64.sve.whilelo.c32(i64 %a, i64 %b, i32 4) + ret target("aarch64.svcount") %out +} + +define target("aarch64.svcount") @whilelo_c64_vl2(i64 %a, i64 %b) nounwind { +; CHECK-LABEL: whilelo_c64_vl2: +; CHECK: // %bb.0: +; CHECK-NEXT: whilelo pn8.d, x0, x1, vlx2 +; CHECK-NEXT: mov p0.b, p8.b +; CHECK-NEXT: ret + %out = call target("aarch64.svcount") @llvm.aarch64.sve.whilelo.c64(i64 %a, i64 %b, i32 2) + ret target("aarch64.svcount") %out +} + +define target("aarch64.svcount") @whilelo_c64_vl4(i64 %a, i64 %b) nounwind { +; CHECK-LABEL: whilelo_c64_vl4: +; CHECK: // %bb.0: +; CHECK-NEXT: whilelo pn8.d, x0, x1, vlx4 +; CHECK-NEXT: mov p0.b, p8.b +; CHECK-NEXT: ret + %out = call target("aarch64.svcount") @llvm.aarch64.sve.whilelo.c64(i64 %a, i64 %b, i32 4) + ret target("aarch64.svcount") %out +} + + +; +; WHILELS +; + +define target("aarch64.svcount") @whilels_c8_vl2(i64 %a, i64 %b) nounwind { +; CHECK-LABEL: whilels_c8_vl2: +; CHECK: // %bb.0: +; CHECK-NEXT: whilels pn8.b, x0, x1, vlx2 +; CHECK-NEXT: mov p0.b, p8.b +; CHECK-NEXT: ret + %out = call target("aarch64.svcount") @llvm.aarch64.sve.whilels.c8(i64 %a, i64 %b, i32 2) + ret target("aarch64.svcount") %out +} + +define target("aarch64.svcount") @whilels_c8_vl4(i64 %a, i64 %b) nounwind { +; CHECK-LABEL: whilels_c8_vl4: +; CHECK: // %bb.0: +; CHECK-NEXT: whilels pn8.b, x0, x1, vlx4 +; CHECK-NEXT: mov p0.b, p8.b +; CHECK-NEXT: ret + %out = call target("aarch64.svcount") @llvm.aarch64.sve.whilels.c8(i64 %a, i64 %b, i32 4) + ret target("aarch64.svcount") %out +} + +define target("aarch64.svcount") @whilels_c16_vl2(i64 %a, i64 %b) nounwind { +; CHECK-LABEL: whilels_c16_vl2: +; CHECK: // %bb.0: +; CHECK-NEXT: whilels pn8.h, x0, x1, vlx2 +; CHECK-NEXT: mov p0.b, p8.b +; CHECK-NEXT: ret + %out = call target("aarch64.svcount") @llvm.aarch64.sve.whilels.c16(i64 %a, i64 %b, i32 2) + ret target("aarch64.svcount") %out +} + +define target("aarch64.svcount") @whilels_c16_vl4(i64 %a, i64 %b) nounwind { +; CHECK-LABEL: whilels_c16_vl4: +; CHECK: // %bb.0: +; CHECK-NEXT: whilels pn8.h, x0, x1, vlx4 +; CHECK-NEXT: mov p0.b, p8.b +; CHECK-NEXT: ret + %out = call target("aarch64.svcount") @llvm.aarch64.sve.whilels.c16(i64 %a, i64 %b, i32 4) + ret target("aarch64.svcount") %out +} + +define target("aarch64.svcount") @whilels_c32_vl2(i64 %a, i64 %b) nounwind { +; CHECK-LABEL: whilels_c32_vl2: +; CHECK: // %bb.0: +; CHECK-NEXT: whilels pn8.s, x0, x1, vlx2 +; CHECK-NEXT: mov p0.b, p8.b +; CHECK-NEXT: ret + %out = call target("aarch64.svcount") @llvm.aarch64.sve.whilels.c32(i64 %a, i64 %b, i32 2) + ret target("aarch64.svcount") %out +} + +define target("aarch64.svcount") @whilels_c32_vl4(i64 %a, i64 %b) nounwind { +; CHECK-LABEL: whilels_c32_vl4: +; CHECK: // %bb.0: +; CHECK-NEXT: whilels pn8.s, x0, x1, vlx4 +; CHECK-NEXT: mov p0.b, p8.b +; CHECK-NEXT: ret + %out = call target("aarch64.svcount") @llvm.aarch64.sve.whilels.c32(i64 %a, i64 %b, i32 4) + ret target("aarch64.svcount") %out +} + +define target("aarch64.svcount") @whilels_c64_vl2(i64 %a, i64 %b) nounwind { +; CHECK-LABEL: whilels_c64_vl2: +; CHECK: // %bb.0: +; CHECK-NEXT: whilels pn8.d, x0, x1, vlx2 +; CHECK-NEXT: mov p0.b, p8.b +; CHECK-NEXT: ret + %out = call target("aarch64.svcount") @llvm.aarch64.sve.whilels.c64(i64 %a, i64 %b, i32 2) + ret target("aarch64.svcount") %out +} + +define target("aarch64.svcount") @whilels_c64_vl4(i64 %a, i64 %b) nounwind { +; 
CHECK-LABEL: whilels_c64_vl4: +; CHECK: // %bb.0: +; CHECK-NEXT: whilels pn8.d, x0, x1, vlx4 +; CHECK-NEXT: mov p0.b, p8.b +; CHECK-NEXT: ret + %out = call target("aarch64.svcount") @llvm.aarch64.sve.whilels.c64(i64 %a, i64 %b, i32 4) + ret target("aarch64.svcount") %out +} + + +; +; WHILELT +; + +define target("aarch64.svcount") @whilelt_c8_vl2(i64 %a, i64 %b) nounwind { +; CHECK-LABEL: whilelt_c8_vl2: +; CHECK: // %bb.0: +; CHECK-NEXT: whilelt pn8.b, x0, x1, vlx2 +; CHECK-NEXT: mov p0.b, p8.b +; CHECK-NEXT: ret + %out = call target("aarch64.svcount") @llvm.aarch64.sve.whilelt.c8(i64 %a, i64 %b, i32 2) + ret target("aarch64.svcount") %out +} + +define target("aarch64.svcount") @whilelt_c8_vl4(i64 %a, i64 %b) nounwind { +; CHECK-LABEL: whilelt_c8_vl4: +; CHECK: // %bb.0: +; CHECK-NEXT: whilelt pn8.b, x0, x1, vlx4 +; CHECK-NEXT: mov p0.b, p8.b +; CHECK-NEXT: ret + %out = call target("aarch64.svcount") @llvm.aarch64.sve.whilelt.c8(i64 %a, i64 %b, i32 4) + ret target("aarch64.svcount") %out +} + +define target("aarch64.svcount") @whilelt_c16_vl2(i64 %a, i64 %b) nounwind { +; CHECK-LABEL: whilelt_c16_vl2: +; CHECK: // %bb.0: +; CHECK-NEXT: whilelt pn8.h, x0, x1, vlx2 +; CHECK-NEXT: mov p0.b, p8.b +; CHECK-NEXT: ret + %out = call target("aarch64.svcount") @llvm.aarch64.sve.whilelt.c16(i64 %a, i64 %b, i32 2) + ret target("aarch64.svcount") %out +} + +define target("aarch64.svcount") @whilelt_c16_vl4(i64 %a, i64 %b) nounwind { +; CHECK-LABEL: whilelt_c16_vl4: +; CHECK: // %bb.0: +; CHECK-NEXT: whilelt pn8.h, x0, x1, vlx4 +; CHECK-NEXT: mov p0.b, p8.b +; CHECK-NEXT: ret + %out = call target("aarch64.svcount") @llvm.aarch64.sve.whilelt.c16(i64 %a, i64 %b, i32 4) + ret target("aarch64.svcount") %out +} + +define target("aarch64.svcount") @whilelt_c32_vl2(i64 %a, i64 %b) nounwind { +; CHECK-LABEL: whilelt_c32_vl2: +; CHECK: // %bb.0: +; CHECK-NEXT: whilelt pn8.s, x0, x1, vlx2 +; CHECK-NEXT: mov p0.b, p8.b +; CHECK-NEXT: ret + %out = call target("aarch64.svcount") @llvm.aarch64.sve.whilelt.c32(i64 %a, i64 %b, i32 2) + ret target("aarch64.svcount") %out +} + +define target("aarch64.svcount") @whilelt_c32_vl4(i64 %a, i64 %b) nounwind { +; CHECK-LABEL: whilelt_c32_vl4: +; CHECK: // %bb.0: +; CHECK-NEXT: whilelt pn8.s, x0, x1, vlx4 +; CHECK-NEXT: mov p0.b, p8.b +; CHECK-NEXT: ret + %out = call target("aarch64.svcount") @llvm.aarch64.sve.whilelt.c32(i64 %a, i64 %b, i32 4) + ret target("aarch64.svcount") %out +} + +define target("aarch64.svcount") @whilelt_c64_vl2(i64 %a, i64 %b) nounwind { +; CHECK-LABEL: whilelt_c64_vl2: +; CHECK: // %bb.0: +; CHECK-NEXT: whilelt pn8.d, x0, x1, vlx2 +; CHECK-NEXT: mov p0.b, p8.b +; CHECK-NEXT: ret + %out = call target("aarch64.svcount") @llvm.aarch64.sve.whilelt.c64(i64 %a, i64 %b, i32 2) + ret target("aarch64.svcount") %out +} + +define target("aarch64.svcount") @whilelt_c64_vl4(i64 %a, i64 %b) nounwind { +; CHECK-LABEL: whilelt_c64_vl4: +; CHECK: // %bb.0: +; CHECK-NEXT: whilelt pn8.d, x0, x1, vlx4 +; CHECK-NEXT: mov p0.b, p8.b +; CHECK-NEXT: ret + %out = call target("aarch64.svcount") @llvm.aarch64.sve.whilelt.c64(i64 %a, i64 %b, i32 4) + ret target("aarch64.svcount") %out +} + + +declare target("aarch64.svcount") @llvm.aarch64.sve.whilege.c8(i64, i64, i32) +declare target("aarch64.svcount") @llvm.aarch64.sve.whilege.c16(i64, i64, i32) +declare target("aarch64.svcount") @llvm.aarch64.sve.whilege.c32(i64, i64, i32) +declare target("aarch64.svcount") @llvm.aarch64.sve.whilege.c64(i64, i64, i32) +declare target("aarch64.svcount") @llvm.aarch64.sve.whilegt.c8(i64, i64, 
i32) +declare target("aarch64.svcount") @llvm.aarch64.sve.whilegt.c16(i64, i64, i32) +declare target("aarch64.svcount") @llvm.aarch64.sve.whilegt.c32(i64, i64, i32) +declare target("aarch64.svcount") @llvm.aarch64.sve.whilegt.c64(i64, i64, i32) +declare target("aarch64.svcount") @llvm.aarch64.sve.whilehi.c8(i64, i64, i32) +declare target("aarch64.svcount") @llvm.aarch64.sve.whilehi.c16(i64, i64, i32) +declare target("aarch64.svcount") @llvm.aarch64.sve.whilehi.c32(i64, i64, i32) +declare target("aarch64.svcount") @llvm.aarch64.sve.whilehi.c64(i64, i64, i32) +declare target("aarch64.svcount") @llvm.aarch64.sve.whilehs.c8(i64, i64, i32) +declare target("aarch64.svcount") @llvm.aarch64.sve.whilehs.c16(i64, i64, i32) +declare target("aarch64.svcount") @llvm.aarch64.sve.whilehs.c32(i64, i64, i32) +declare target("aarch64.svcount") @llvm.aarch64.sve.whilehs.c64(i64, i64, i32) +declare target("aarch64.svcount") @llvm.aarch64.sve.whilele.c8(i64, i64, i32) +declare target("aarch64.svcount") @llvm.aarch64.sve.whilele.c16(i64, i64, i32) +declare target("aarch64.svcount") @llvm.aarch64.sve.whilele.c32(i64, i64, i32) +declare target("aarch64.svcount") @llvm.aarch64.sve.whilele.c64(i64, i64, i32) +declare target("aarch64.svcount") @llvm.aarch64.sve.whilelo.c8(i64, i64, i32) +declare target("aarch64.svcount") @llvm.aarch64.sve.whilelo.c16(i64, i64, i32) +declare target("aarch64.svcount") @llvm.aarch64.sve.whilelo.c32(i64, i64, i32) +declare target("aarch64.svcount") @llvm.aarch64.sve.whilelo.c64(i64, i64, i32) +declare target("aarch64.svcount") @llvm.aarch64.sve.whilels.c8(i64, i64, i32) +declare target("aarch64.svcount") @llvm.aarch64.sve.whilels.c16(i64, i64, i32) +declare target("aarch64.svcount") @llvm.aarch64.sve.whilels.c32(i64, i64, i32) +declare target("aarch64.svcount") @llvm.aarch64.sve.whilels.c64(i64, i64, i32) +declare target("aarch64.svcount") @llvm.aarch64.sve.whilelt.c8(i64, i64, i32) +declare target("aarch64.svcount") @llvm.aarch64.sve.whilelt.c16(i64, i64, i32) +declare target("aarch64.svcount") @llvm.aarch64.sve.whilelt.c32(i64, i64, i32) +declare target("aarch64.svcount") @llvm.aarch64.sve.whilelt.c64(i64, i64, i32) diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-while-pp.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-while-pp.ll --- a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-while-pp.ll +++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-while-pp.ll @@ -377,7 +377,7 @@ } -; Test that we get good code quality when using while in combination with other intrinsics +; Test that we get good code quality for code generated from ACLE builtins define <vscale x 8 x i1> @codegen_whilege_b16_x2(i64 noundef %op1, i64 noundef %op2) nounwind { ; CHECK-LABEL: codegen_whilege_b16_x2: diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-zipx2.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-zipx2.ll --- a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-zipx2.ll +++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-zipx2.ll @@ -3,40 +3,31 @@ ; == 8 to 64-bit elements == -define <vscale x 16 x i8> @zip_x2_i8(<vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) nounwind { +define { <vscale x 16 x i8>, <vscale x 16 x i8> } @zip_x2_i8(<vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) nounwind { ; CHECK-LABEL: zip_x2_i8: ; CHECK: // %bb.0: -; CHECK-NEXT: zip { z2.b, z3.b }, z0.b, z1.b -; CHECK-NEXT: add z0.b, z2.b, z0.b +; CHECK-NEXT: zip { z0.b, z1.b }, z0.b, z1.b ; CHECK-NEXT: ret - %zip = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.zip.x2.nxv16i8(<vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) - %zip0 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %zip, 0 - %add = add <vscale x 16 x i8> %zip0, %zn - ret <vscale x 16 x i8> %add + %res = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.zip.x2.nxv16i8(<vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) + ret { <vscale x 16 x i8>, <vscale x 16 x i8> } %res } -define <vscale x 8 x i16> @zip_x2_i16(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) nounwind { +define { <vscale x 8 x i16>, <vscale x 8 x i16> 
} @zip_x2_i16(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) nounwind { ; CHECK-LABEL: zip_x2_i16: ; CHECK: // %bb.0: -; CHECK-NEXT: zip { z2.h, z3.h }, z0.h, z1.h -; CHECK-NEXT: add z0.h, z2.h, z0.h +; CHECK-NEXT: zip { z0.h, z1.h }, z0.h, z1.h ; CHECK-NEXT: ret - %zip = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.zip.x2.nxv8i16(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) - %zip0 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %zip, 0 - %add = add <vscale x 8 x i16> %zip0, %zn - ret <vscale x 8 x i16> %add + %res = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.zip.x2.nxv8i16(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) + ret { <vscale x 8 x i16>, <vscale x 8 x i16> } %res } -define <vscale x 8 x half> @zip_x2_f16(<vscale x 8 x half> %zn, <vscale x 8 x half> %zm) nounwind { +define { <vscale x 8 x half>, <vscale x 8 x half> } @zip_x2_f16(<vscale x 8 x half> %zn, <vscale x 8 x half> %zm) nounwind { ; CHECK-LABEL: zip_x2_f16: ; CHECK: // %bb.0: -; CHECK-NEXT: zip { z2.h, z3.h }, z0.h, z1.h -; CHECK-NEXT: fadd z0.h, z2.h, z0.h +; CHECK-NEXT: zip { z0.h, z1.h }, z0.h, z1.h ; CHECK-NEXT: ret - %zip = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.zip.x2.nxv8f16(<vscale x 8 x half> %zn, <vscale x 8 x half> %zm) - %zip0 = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } %zip, 0 - %add = fadd <vscale x 8 x half> %zip0, %zn - ret <vscale x 8 x half> %add + %res = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.zip.x2.nxv8f16(<vscale x 8 x half> %zn, <vscale x 8 x half> %zm) + ret { <vscale x 8 x half>, <vscale x 8 x half> } %res } define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @zip_x2_bf16(<vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm) nounwind { @@ -44,74 +35,65 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: zip { z0.h, z1.h }, z0.h, z1.h ; CHECK-NEXT: ret - %zip = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.zip.x2.nxv8bf16(<vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm) - ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %zip + %res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.zip.x2.nxv8bf16(<vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm) + ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res } -define <vscale x 4 x i32> @zip_x2_i32(<vscale x 4 x i32> %zn, <vscale x 4 x i32> %zm) nounwind { +define { <vscale x 4 x i32>, <vscale x 4 x i32> } @zip_x2_i32(<vscale x 4 x i32> %zn, <vscale x 4 x i32> %zm) nounwind { ; CHECK-LABEL: zip_x2_i32: ; CHECK: // %bb.0: -; CHECK-NEXT: zip { z2.s, z3.s }, z0.s, z1.s -; CHECK-NEXT: add z0.s, z2.s, z0.s +; CHECK-NEXT: zip { z0.s, z1.s }, z0.s, z1.s ; CHECK-NEXT: ret - %zip = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.zip.x2.nxv4i32(<vscale x 4 x i32> %zn, <vscale x 4 x i32> %zm) - %zip0 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %zip, 0 - %add = add <vscale x 4 x i32> %zip0, %zn - ret <vscale x 4 x i32> %add + %res = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.zip.x2.nxv4i32(<vscale x 4 x i32> %zn, <vscale x 4 x i32> %zm) + ret { <vscale x 4 x i32>, <vscale x 4 x i32> } %res } -define <vscale x 4 x float> @zip_x2_f32(<vscale x 4 x float> %zn, <vscale x 4 x float> %zm) nounwind { +define { <vscale x 4 x float>, <vscale x 4 x float> } @zip_x2_f32(<vscale x 4 x float> %zn, <vscale x 4 x float> %zm) nounwind { ; CHECK-LABEL: zip_x2_f32: ; CHECK: // %bb.0: -; CHECK-NEXT: zip { z2.s, z3.s }, z0.s, z1.s -; CHECK-NEXT: fadd z0.s, z2.s, z0.s +; CHECK-NEXT: zip { z0.s, z1.s }, z0.s, z1.s ; CHECK-NEXT: ret - %zip = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.zip.x2.nxv4f32(<vscale x 4 x float> %zn, <vscale x 4 x float> %zm) - %zip0 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %zip, 0 - %add = fadd <vscale x 4 x float> %zip0, %zn - ret <vscale x 4 x float> %add + %res = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.zip.x2.nxv4f32(<vscale x 4 x float> %zn, <vscale x 4 x float> %zm) + ret { <vscale x 4 x float>, <vscale x 4 x float> } %res } -define <vscale x 2 x i64> @zip_x2_i64(<vscale x 2 x i64> %zn, <vscale x 2 x i64> %zm) nounwind { +define { <vscale x 2 x i64>, <vscale x 2 x i64> } @zip_x2_i64(<vscale x 2 x i64> %zn, <vscale x 2 x i64> %zm) nounwind { ; CHECK-LABEL: zip_x2_i64: ; CHECK: // %bb.0: -; CHECK-NEXT: zip { z2.d, z3.d }, z0.d, z1.d -; CHECK-NEXT: add z0.d, z2.d, z0.d +; CHECK-NEXT: zip { z0.d, z1.d }, z0.d, z1.d ; CHECK-NEXT: ret - %zip = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.zip.x2.nxv2i64(<vscale x 2 x i64> %zn, <vscale x 2 x i64> %zm) - %zip0 = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } %zip, 0 - %add = add <vscale x 2 x i64> %zip0, %zn - ret <vscale x 2 x i64> %add + %res = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.zip.x2.nxv2i64(<vscale x 2 x i64> %zn, <vscale x 2 x i64> %zm) + ret { <vscale x 2 x i64>, <vscale x 2 x i64> } %res } -define <vscale x 2 x double> @zip_x2_f64(<vscale x 2 x double> %zn, <vscale x 2 x double> %zm) nounwind { +define { <vscale x 2 x double>, <vscale x 2 x double> } @zip_x2_f64(<vscale x 2 x double> %zn, <vscale x 2 x double> %zm) nounwind { ; CHECK-LABEL: zip_x2_f64: ; CHECK: // %bb.0: -; CHECK-NEXT: zip { z2.d, z3.d }, z0.d, z1.d -; CHECK-NEXT: fadd z0.d, z2.d, z0.d +; CHECK-NEXT: zip { z0.d, z1.d }, z0.d, z1.d ; CHECK-NEXT: ret - %zip = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.zip.x2.nxv2f64(<vscale x 2 x double> %zn, <vscale x 2 x double> %zm) - %zip0 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %zip, 0 - %add = fadd <vscale x 2 x double> %zip0, %zn - ret <vscale x 2 x double> %add + %res = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.zip.x2.nxv2f64(<vscale x 2 x double> %zn, <vscale x 2 x double> %zm) + ret { <vscale x 2 x double>, <vscale x 2 x double> } %res +} + +define { <vscale x 16 x i8>, <vscale x 16 x i8> } @zip_x2_i8_not_tied(<vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) nounwind { +; CHECK-LABEL: zip_x2_i8_not_tied: +; CHECK: // %bb.0: +; CHECK-NEXT: zip { z0.b, z1.b }, z1.b, z2.b +; CHECK-NEXT: ret + %res = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.zip.x2.nxv16i8(<vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) + ret { <vscale x 16 x i8>, <vscale x 16 x i8> } %res } ; == 
128-bit elements == -; NOTE: For the 128-bit case we only need to check the <vscale x 16 x i8> to -; ensure the tuple result starts at the correct register multiple. The other -; variants all test the same code path. -define <vscale x 16 x i8> @zipq_x2_i8(<vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) nounwind { +define { <vscale x 16 x i8>, <vscale x 16 x i8> } @zipq_x2_i8(<vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) nounwind { ; CHECK-LABEL: zipq_x2_i8: ; CHECK: // %bb.0: -; CHECK-NEXT: zip { z2.q, z3.q }, z0.q, z1.q -; CHECK-NEXT: add z0.b, z2.b, z0.b +; CHECK-NEXT: zip { z0.q, z1.q }, z0.q, z1.q ; CHECK-NEXT: ret - %zip = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.zipq.x2.nxv16i8(<vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) - %zip0 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %zip, 0 - %add = add <vscale x 16 x i8> %zip0, %zn - ret <vscale x 16 x i8> %add + %res = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.zipq.x2.nxv16i8(<vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) + ret { <vscale x 16 x i8>, <vscale x 16 x i8> } %res } define { <vscale x 8 x i16>, <vscale x 8 x i16> } @zipq_x2_i16(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) nounwind { diff --git a/llvm/test/CodeGen/AArch64/variant-pcs.ll b/llvm/test/CodeGen/AArch64/variant-pcs.ll --- a/llvm/test/CodeGen/AArch64/variant-pcs.ll +++ b/llvm/test/CodeGen/AArch64/variant-pcs.ll @@ -1,5 +1,5 @@ -; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -o - %s | FileCheck %s --check-prefix=CHECK-ASM --strict-whitespace -; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -filetype=obj -o - %s \ +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+sme -o - %s | FileCheck %s --check-prefix=CHECK-ASM --strict-whitespace +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+sme -filetype=obj -o - %s \ ; RUN: | llvm-readobj --symbols - | FileCheck %s --check-prefix=CHECK-OBJ define i32 @base_pcs() { @@ -49,3 +49,40 @@ ; CHECK-OBJ: Other [ (0x80) ret void } + +define void @variant_pcs_sme_streaming() "aarch64_pstate_sm_enabled" { +; CHECK-ASM: .variant_pcs variant_pcs_sme_streaming +; CHECK-ASM-NEXT: variant_pcs_sme_streaming: +; CHECK-OBJ-LABEL: Name: variant_pcs_sme_streaming +; CHECK-OBJ: Other [ (0x80) + ret void +} + +define void @variant_pcs_sme_streaming_compatible() "aarch64_pstate_sm_compatible" { +; CHECK-ASM: .variant_pcs variant_pcs_sme_streaming_compatible +; CHECK-ASM-NEXT: variant_pcs_sme_streaming_compatible: +; CHECK-OBJ-LABEL: Name: variant_pcs_sme_streaming_compatible +; CHECK-OBJ: Other [ (0x80) + ret void +} + +define void @variant_pcs_sme_shared_za() "aarch64_pstate_za_shared" { +; CHECK-ASM: .variant_pcs variant_pcs_sme_shared_za +; CHECK-ASM-NEXT: variant_pcs_sme_shared_za: +; CHECK-OBJ-LABEL: Name: variant_pcs_sme_shared_za +; CHECK-OBJ: Other [ (0x80) + ret void +} + +define void @variant_pcs_sme_new_za() "aarch64_pstate_za_new" { +; CHECK-ASM-NOT: .variant_pcs variant_pcs_sme_new_za + ret void +} + +define void @variant_pcs_sme_preserves_za() "aarch64_pstate_za_shared" "aarch64_pstate_za_preserved" { +; CHECK-ASM: .variant_pcs variant_pcs_sme_preserves_za +; CHECK-ASM-NEXT: variant_pcs_sme_preserves_za: +; CHECK-OBJ-LABEL: Name: variant_pcs_sme_preserves_za +; CHECK-OBJ: Other [ (0x80) + ret void +} diff --git a/llvm/test/MC/AArch64/SVE2p1/feature-sve2p1-implies-bf16.s b/llvm/test/MC/AArch64/SVE2p1/feature-sve2p1-implies-bf16.s new file mode 100644 --- /dev/null +++ b/llvm/test/MC/AArch64/SVE2p1/feature-sve2p1-implies-bf16.s @@ -0,0 +1,7 @@ +// This test verifies SVE2p1 implies bf16. 
+ +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2p1 2>&1 < %s \ +// RUN: | FileCheck %s + +bfcvt z0.h, p0/m, z1.s +// CHECK: bfcvt z0.h, p0/m, z1.s \ No newline at end of file diff --git a/llvm/test/Transforms/Inline/AArch64/sme-streaming-attr.ll b/llvm/test/Transforms/Inline/AArch64/sme-streaming-attr.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/Inline/AArch64/sme-streaming-attr.ll @@ -0,0 +1,339 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -mtriple=aarch64-unknown-linux-gnu -mattr=+sme -S -passes=inline | FileCheck %s + +declare void @inlined_streaming_body() "aarch64_pstate_sm_enabled"; +declare void @inlined_locally_streaming_body() "aarch64_pstate_sm_body"; +declare void @inlined_streaming_compatible_body() "aarch64_pstate_sm_compatible"; +declare void @inlined_streaming_compatible_locally_streaming_body() "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body"; +declare void @inlined_normal_body(); + +define void @streaming_callee() "aarch64_pstate_sm_enabled" { +; CHECK-LABEL: @streaming_callee( +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @inlined_streaming_body() +; CHECK-NEXT: ret void +; +entry: + call void @inlined_streaming_body() + ret void +} + +define void @locally_streaming_callee() "aarch64_pstate_sm_body" { +; CHECK-LABEL: @locally_streaming_callee( +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @inlined_locally_streaming_body() +; CHECK-NEXT: ret void +; +entry: + call void @inlined_locally_streaming_body() + ret void +} + +define void @streaming_compatible_callee() "aarch64_pstate_sm_compatible" { +; CHECK-LABEL: @streaming_compatible_callee( +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @inlined_streaming_compatible_body() +; CHECK-NEXT: ret void +; +entry: + call void @inlined_streaming_compatible_body() + ret void +} + +define void @streaming_compatible_locally_streaming_callee() "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" { +; CHECK-LABEL: @streaming_compatible_locally_streaming_callee( +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @inlined_streaming_compatible_locally_streaming_body() +; CHECK-NEXT: ret void +; +entry: + call void @inlined_streaming_compatible_locally_streaming_body() + ret void +} + +define void @normal_callee() { +; CHECK-LABEL: @normal_callee( +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @inlined_normal_body() +; CHECK-NEXT: ret void +; +entry: + call void @inlined_normal_body() + ret void +} + +define void @streaming_caller_streaming_callee_inline() "aarch64_pstate_sm_enabled" { +; CHECK-LABEL: @streaming_caller_streaming_callee_inline( +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @inlined_streaming_body() +; CHECK-NEXT: ret void +; +entry: + call void @streaming_callee() + ret void +} + +define void @streaming_caller_locally_streaming_callee_inline() "aarch64_pstate_sm_enabled" { +; CHECK-LABEL: @streaming_caller_locally_streaming_callee_inline( +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @inlined_locally_streaming_body() +; CHECK-NEXT: ret void +; +entry: + call void @locally_streaming_callee() + ret void +} + +define void @streaming_caller_streaming_compatible_callee_inline() "aarch64_pstate_sm_enabled" { +; CHECK-LABEL: @streaming_caller_streaming_compatible_callee_inline( +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @inlined_streaming_compatible_body() +; CHECK-NEXT: ret void +; +entry: + call void @streaming_compatible_callee() + ret void +} + +define void @streaming_caller_streaming_compatible_locally_streaming_callee_inline() 
"aarch64_pstate_sm_enabled" { +; CHECK-LABEL: @streaming_caller_streaming_compatible_locally_streaming_callee_inline( +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @inlined_streaming_compatible_locally_streaming_body() +; CHECK-NEXT: ret void +; +entry: + call void @streaming_compatible_locally_streaming_callee() + ret void +} + +define void @streaming_caller_normal_callee_dont_inline() "aarch64_pstate_sm_enabled" { +; CHECK-LABEL: @streaming_caller_normal_callee_dont_inline( +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @normal_callee() +; CHECK-NEXT: ret void +; +entry: + call void @normal_callee() + ret void +} + +define void @locally_streaming_caller_streaming_callee_inline() "aarch64_pstate_sm_body" { +; CHECK-LABEL: @locally_streaming_caller_streaming_callee_inline( +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @inlined_streaming_body() +; CHECK-NEXT: ret void +; +entry: + call void @streaming_callee() + ret void +} + +define void @locally_streaming_caller_locally_streaming_callee_inline() "aarch64_pstate_sm_body" { +; CHECK-LABEL: @locally_streaming_caller_locally_streaming_callee_inline( +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @inlined_locally_streaming_body() +; CHECK-NEXT: ret void +; +entry: + call void @locally_streaming_callee() + ret void +} + +define void @locally_streaming_caller_streaming_compatible_callee_inline() "aarch64_pstate_sm_body" { +; CHECK-LABEL: @locally_streaming_caller_streaming_compatible_callee_inline( +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @inlined_streaming_compatible_body() +; CHECK-NEXT: ret void +; +entry: + call void @streaming_compatible_callee() + ret void +} + +define void @locally_streaming_caller_streaming_compatible_locally_streaming_callee_inline() "aarch64_pstate_sm_body" { +; CHECK-LABEL: @locally_streaming_caller_streaming_compatible_locally_streaming_callee_inline( +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @inlined_streaming_compatible_locally_streaming_body() +; CHECK-NEXT: ret void +; +entry: + call void @streaming_compatible_locally_streaming_callee() + ret void +} + +define void @locally_streaming_caller_normal_callee_dont_inline() "aarch64_pstate_sm_body" { +; CHECK-LABEL: @locally_streaming_caller_normal_callee_dont_inline( +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @normal_callee() +; CHECK-NEXT: ret void +; +entry: + call void @normal_callee() + ret void +} + +define void @streaming_compatible_caller_streaming_callee_dont_inline() "aarch64_pstate_sm_compatible" { +; CHECK-LABEL: @streaming_compatible_caller_streaming_callee_dont_inline( +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @streaming_callee() +; CHECK-NEXT: ret void +; +entry: + call void @streaming_callee() + ret void +} + +define void @streaming_compatible_caller_locally_streaming_callee_dont_inline() "aarch64_pstate_sm_compatible" { +; CHECK-LABEL: @streaming_compatible_caller_locally_streaming_callee_dont_inline( +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @locally_streaming_callee() +; CHECK-NEXT: ret void +; +entry: + call void @locally_streaming_callee() + ret void +} + +define void @streaming_compatible_caller_streaming_compatible_callee_inline() "aarch64_pstate_sm_compatible" { +; CHECK-LABEL: @streaming_compatible_caller_streaming_compatible_callee_inline( +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @inlined_streaming_compatible_body() +; CHECK-NEXT: ret void +; +entry: + call void @streaming_compatible_callee() + ret void +} + +define void @streaming_compatible_caller_streaming_compatible_locally_streaming_callee_dont_inline() 
"aarch64_pstate_sm_compatible" { +; CHECK-LABEL: @streaming_compatible_caller_streaming_compatible_locally_streaming_callee_dont_inline( +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @streaming_compatible_locally_streaming_callee() +; CHECK-NEXT: ret void +; +entry: + call void @streaming_compatible_locally_streaming_callee() + ret void +} + +define void @streaming_compatible_caller_normal_callee_dont_inline() "aarch64_pstate_sm_compatible" { +; CHECK-LABEL: @streaming_compatible_caller_normal_callee_dont_inline( +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @normal_callee() +; CHECK-NEXT: ret void +; +entry: + call void @normal_callee() + ret void +} + +define void @normal_caller_streaming_callee_dont_inline() { +; CHECK-LABEL: @normal_caller_streaming_callee_dont_inline( +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @streaming_callee() +; CHECK-NEXT: ret void +; +entry: + call void @streaming_callee() + ret void +} + +define void @normal_caller_locally_streaming_callee_dont_inline() { +; CHECK-LABEL: @normal_caller_locally_streaming_callee_dont_inline( +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @locally_streaming_callee() +; CHECK-NEXT: ret void +; +entry: + call void @locally_streaming_callee() + ret void +} + +define void @normal_caller_streaming_compatible_callee_inline() { +; CHECK-LABEL: @normal_caller_streaming_compatible_callee_inline( +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @inlined_streaming_compatible_body() +; CHECK-NEXT: ret void +; +entry: + call void @streaming_compatible_callee() + ret void +} + +define void @normal_caller_streaming_compatible_locally_streaming_callee_dont_inline() { +; CHECK-LABEL: @normal_caller_streaming_compatible_locally_streaming_callee_dont_inline( +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @streaming_compatible_locally_streaming_callee() +; CHECK-NEXT: ret void +; +entry: + call void @streaming_compatible_locally_streaming_callee() + ret void +} + +define void @normal_caller_normal_callee_inline() { +; CHECK-LABEL: @normal_caller_normal_callee_inline( +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @inlined_normal_body() +; CHECK-NEXT: ret void +; +entry: + call void @normal_callee() + ret void +} + + +define void @streaming_compatible_locally_streaming_caller_streaming_callee_inline() "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" { +; CHECK-LABEL: @streaming_compatible_locally_streaming_caller_streaming_callee_inline( +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @inlined_streaming_body() +; CHECK-NEXT: ret void +; +entry: + call void @streaming_callee() + ret void +} + +define void @streaming_compatible_locally_streaming_caller_locally_streaming_callee_inline() "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" { +; CHECK-LABEL: @streaming_compatible_locally_streaming_caller_locally_streaming_callee_inline( +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @inlined_locally_streaming_body() +; CHECK-NEXT: ret void +; +entry: + call void @locally_streaming_callee() + ret void +} + +define void @streaming_compatible_locally_streaming_caller_streaming_compatible_callee_inline() "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" { +; CHECK-LABEL: @streaming_compatible_locally_streaming_caller_streaming_compatible_callee_inline( +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @inlined_streaming_compatible_body() +; CHECK-NEXT: ret void +; +entry: + call void @streaming_compatible_callee() + ret void +} + +define void @streaming_compatible_locally_streaming_caller_and_callee_inline() "aarch64_pstate_sm_compatible" 
"aarch64_pstate_sm_body" { +; CHECK-LABEL: @streaming_compatible_locally_streaming_caller_and_callee_inline( +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @inlined_streaming_compatible_locally_streaming_body() +; CHECK-NEXT: ret void +; +entry: + call void @streaming_compatible_locally_streaming_callee() + ret void +} + +define void @streaming_compatible_locally_streaming_caller_normal_callee_dont_inline() "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" { +; CHECK-LABEL: @streaming_compatible_locally_streaming_caller_normal_callee_dont_inline( +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @normal_callee() +; CHECK-NEXT: ret void +; +entry: + call void @normal_callee() + ret void +} diff --git a/llvm/test/Transforms/Inline/AArch64/sme-za-attr.ll b/llvm/test/Transforms/Inline/AArch64/sme-za-attr.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/Inline/AArch64/sme-za-attr.ll @@ -0,0 +1,125 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -mtriple=aarch64-unknown-linux-gnu -mattr=+sme -S -passes=inline < %s | FileCheck %s + +declare void @inlined_normal_body(); +declare void @inlined_shared_za_body() "aarch64_pstate_za_shared"; +declare void @inlined_new_za_body() "aarch64_pstate_za_new"; + +define void @normal_callee() { +; CHECK-LABEL: @normal_callee( +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @inlined_normal_body() +; CHECK-NEXT: ret void +; +entry: + call void @inlined_normal_body() + ret void +} + +define void @shared_za_callee() "aarch64_pstate_za_shared" { +; CHECK-LABEL: @shared_za_callee( +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @inlined_shared_za_body() +; CHECK-NEXT: ret void +; +entry: + call void @inlined_shared_za_body() + ret void +} + +define void @new_za_callee() "aarch64_pstate_za_new" { +; CHECK-LABEL: @new_za_callee( +; CHECK-NEXT: call void @inlined_new_za_body() +; CHECK-NEXT: ret void +; + call void @inlined_new_za_body() + ret void +} + +define void @normal_caller_normal_callee_inline() { +; CHECK-LABEL: @normal_caller_normal_callee_inline( +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @inlined_normal_body() +; CHECK-NEXT: ret void +; +entry: + call void @normal_callee() + ret void +} + +define void @normal_caller_new_za_callee_dont_inline() { +; CHECK-LABEL: @normal_caller_new_za_callee_dont_inline( +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @new_za_callee() +; CHECK-NEXT: ret void +; +entry: + call void @new_za_callee() + ret void +} + +define void @new_za_caller_normal_callee_dont_inline() "aarch64_pstate_za_new" { +; CHECK-LABEL: @new_za_caller_normal_callee_dont_inline( +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @normal_callee() +; CHECK-NEXT: ret void +; +entry: + call void @normal_callee() + ret void +} + +define void @new_za_caller_shared_za_callee_inline() "aarch64_pstate_za_new" { +; CHECK-LABEL: @new_za_caller_shared_za_callee_inline( +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @inlined_shared_za_body() +; CHECK-NEXT: ret void +; +entry: + call void @shared_za_callee() + ret void +} + +define void @new_za_caller_new_za_callee_dont_inline() "aarch64_pstate_za_new" { +; CHECK-LABEL: @new_za_caller_new_za_callee_dont_inline( +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @new_za_callee() +; CHECK-NEXT: ret void +; +entry: + call void @new_za_callee() + ret void +} + +define void @shared_za_caller_normal_callee_dont_inline() "aarch64_pstate_za_shared" { +; CHECK-LABEL: @shared_za_caller_normal_callee_dont_inline( +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @normal_callee() 
+; CHECK-NEXT: ret void +; +entry: + call void @normal_callee() + ret void +} + +define void @shared_za_caller_shared_za_callee_inline() "aarch64_pstate_za_shared" { +; CHECK-LABEL: @shared_za_caller_shared_za_callee_inline( +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @inlined_shared_za_body() +; CHECK-NEXT: ret void +; +entry: + call void @shared_za_callee() + ret void +} + +define void @shared_za_caller_new_za_callee_dont_inline() "aarch64_pstate_za_shared" { +; CHECK-LABEL: @shared_za_caller_new_za_callee_dont_inline( +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @new_za_callee() +; CHECK-NEXT: ret void +; +entry: + call void @new_za_callee() + ret void +}