diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h
--- a/clang/include/clang/AST/Type.h
+++ b/clang/include/clang/AST/Type.h
@@ -2681,6 +2681,8 @@
   bool isSVEBool() const { return getKind() == Kind::SveBool; }
 
+  bool isSVECount() const { return getKind() == Kind::SveCount; }
+
   /// Determines whether the given kind corresponds to a placeholder type.
   static bool isPlaceholderTypeKind(Kind K) {
     return K >= Overload;
@@ -3910,6 +3912,20 @@
   /// because TrailingObjects cannot handle repeated types.
   struct ExceptionType { QualType Type; };
 
+  /// The AArch64 SME ACLE (Arm C/C++ Language Extensions) defines a number
+  /// of function type attributes that can be set on function types, including
+  /// function pointers.
+  enum AArch64SMETypeAttributes : unsigned {
+    SME_NormalFunction = 0,
+    SME_PStateSMEnabledMask = 1 << 0,
+    SME_PStateSMCompatibleMask = 1 << 1,
+    SME_PStateZANewMask = 1 << 2,
+    SME_PStateZASharedMask = 1 << 3,
+    SME_PStateZAPreservedMask = 1 << 4,
+    SME_AttributeMask = 255 // We can only support up to 8 bits because of
+                            // the 8-bit bitfield in FunctionTypeExtraBitfields.
+  };
+
   /// A simple holder for various uncommon bits which do not fit in
   /// FunctionTypeBitfields. Aligned to alignof(void *) to maintain the
   /// alignment of subsequent objects in TrailingObjects.
@@ -3918,6 +3934,13 @@
     /// A whole unsigned is not needed here and according to
     /// [implimits] 8 bits would be enough here.
     unsigned NumExceptionType = 0;
+
+    /// Any AArch64 SME ACLE type attributes that need to be propagated
+    /// on declarations and function pointers.
+    unsigned AArch64SMEAttributes : 8;
+
+    FunctionTypeExtraBitfields()
+        : AArch64SMEAttributes(SME_NormalFunction) {}
   };
 
 protected:
@@ -4098,16 +4121,20 @@
     FunctionType::ExtInfo ExtInfo;
     bool Variadic : 1;
     bool HasTrailingReturn : 1;
+    unsigned AArch64SMEAttributes : 8;
     Qualifiers TypeQuals;
     RefQualifierKind RefQualifier = RQ_None;
     ExceptionSpecInfo ExceptionSpec;
     const ExtParameterInfo *ExtParameterInfos = nullptr;
     SourceLocation EllipsisLoc;
 
-    ExtProtoInfo() : Variadic(false), HasTrailingReturn(false) {}
+    ExtProtoInfo()
+        : Variadic(false), HasTrailingReturn(false),
+          AArch64SMEAttributes(SME_NormalFunction) {}
 
     ExtProtoInfo(CallingConv CC)
-        : ExtInfo(CC), Variadic(false), HasTrailingReturn(false) {}
+        : ExtInfo(CC), Variadic(false), HasTrailingReturn(false),
+          AArch64SMEAttributes(SME_NormalFunction) {}
 
     ExtProtoInfo withExceptionSpec(const ExceptionSpecInfo &ESI) {
       ExtProtoInfo Result(*this);
@@ -4116,7 +4143,12 @@
     }
 
     bool requiresFunctionProtoTypeExtraBitfields() const {
-      return ExceptionSpec.Type == EST_Dynamic;
+      return ExceptionSpec.Type == EST_Dynamic ||
+             AArch64SMEAttributes != SME_NormalFunction;
+    }
+
+    void setArmSMEAttribute(AArch64SMETypeAttributes Kind) {
+      AArch64SMEAttributes |= Kind;
     }
   };
 
@@ -4243,6 +4275,7 @@
     EPI.TypeQuals = getMethodQuals();
     EPI.RefQualifier = getRefQualifier();
     EPI.ExtParameterInfos = getExtParameterInfosOrNull();
+    EPI.AArch64SMEAttributes = getAArch64SMEAttributes();
     return EPI;
   }
 
@@ -4424,6 +4457,14 @@
     return getTrailingObjects();
   }
 
+  /// Return a bitmask describing the SME attributes on the function type; see
+  /// AArch64SMETypeAttributes for their values.
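+  /// A typical query, as a sketch (here `FPT` is assumed to be a
+  /// `const FunctionProtoType *` supplied by the caller):
+  /// \code
+  ///   bool IsStreaming = FPT->getAArch64SMEAttributes() &
+  ///                      FunctionType::SME_PStateSMEnabledMask;
+  /// \endcode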
+  unsigned getAArch64SMEAttributes() const {
+    if (!hasExtraBitfields())
+      return SME_NormalFunction;
+    return getTrailingObjects<FunctionTypeExtraBitfields>()->AArch64SMEAttributes;
+  }
+
   ExtParameterInfo getExtParameterInfo(unsigned I) const {
     assert(I < getNumParams() && "parameter index out of range");
     if (hasExtParameterInfos())
diff --git a/clang/include/clang/AST/TypeProperties.td b/clang/include/clang/AST/TypeProperties.td
--- a/clang/include/clang/AST/TypeProperties.td
+++ b/clang/include/clang/AST/TypeProperties.td
@@ -323,6 +323,9 @@
                     ? node->getExtParameterInfos()
                     : llvm::ArrayRef<FunctionProtoType::ExtParameterInfo>() }];
   }
+  def : Property<"AArch64SMEAttributes", UInt32> {
+    let Read = [{ node->getAArch64SMEAttributes() }];
+  }
 
   def : Creator<[{
     auto extInfo = FunctionType::ExtInfo(noReturn, hasRegParm, regParm,
@@ -338,6 +341,7 @@
     epi.ExceptionSpec = exceptionSpecifier;
     epi.ExtParameterInfos = extParameterInfo.empty()
                               ? nullptr : extParameterInfo.data();
+    epi.AArch64SMEAttributes = AArch64SMEAttributes;
     return ctx.getFunctionType(returnType, parameters, epi);
   }]>;
 }
diff --git a/clang/include/clang/Basic/AArch64SVEACLETypes.def b/clang/include/clang/Basic/AArch64SVEACLETypes.def
--- a/clang/include/clang/Basic/AArch64SVEACLETypes.def
+++ b/clang/include/clang/Basic/AArch64SVEACLETypes.def
@@ -49,6 +49,11 @@
   SVE_TYPE(Name, Id, SingletonId)
 #endif
 
+#ifndef SVE_OPAQUE_TYPE
+#define SVE_OPAQUE_TYPE(Name, MangledName, Id, SingletonId) \
+  SVE_TYPE(Name, Id, SingletonId)
+#endif
+
 //===- Vector point types -----------------------------------------------===//
 
@@ -124,7 +129,12 @@
 SVE_VECTOR_TYPE("__clang_svbfloat16x4_t", "svbfloat16x4_t", SveBFloat16x4, SveBFloat16x4Ty, 32, 16, true, false, true)
 
 SVE_PREDICATE_TYPE("__SVBool_t", "__SVBool_t", SveBool, SveBoolTy, 16)
+SVE_PREDICATE_TYPE("__clang_svboolx2_t", "svboolx2_t", SveBoolx2, SveBoolx2Ty, 32)
+SVE_PREDICATE_TYPE("__clang_svboolx4_t", "svboolx4_t", SveBoolx4, SveBoolx4Ty, 64)
+
+SVE_OPAQUE_TYPE("__SVCount_t", "__SVCount_t", SveCount, SveCountTy)
 
 #undef SVE_VECTOR_TYPE
 #undef SVE_PREDICATE_TYPE
+#undef SVE_OPAQUE_TYPE
 #undef SVE_TYPE
diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -397,6 +397,9 @@
 }
 def TargetARM : TargetArch<["arm", "thumb", "armeb", "thumbeb"]>;
 def TargetAArch64 : TargetArch<["aarch64"]>;
+def TargetAArch64SME : TargetArch<["aarch64"]> {
+  let CustomCode = [{ Target.hasFeature("sme") }];
+}
 def TargetAnyArm : TargetArch<!listconcat(TargetARM.Arches, TargetAArch64.Arches)>;
 def TargetAVR : TargetArch<["avr"]>;
 def TargetBPF : TargetArch<["bpfel", "bpfeb"]>;
@@ -2384,6 +2387,42 @@
   let Documentation = [AArch64VectorPcsDocs];
 }
 
+def ArmStreamingCompatible : TypeAttr, TargetSpecificAttr<TargetAArch64SME> {
+  let Spellings = [GNU<"arm_streaming_compatible">];
+  let Subjects = SubjectList<[HasFunctionProto], ErrorDiag>;
+  let Documentation = [ArmSmeStreamingCompatibleDocs];
+}
+
+def ArmStreaming : TypeAttr, TargetSpecificAttr<TargetAArch64SME> {
+  let Spellings = [GNU<"arm_streaming">];
+  let Subjects = SubjectList<[HasFunctionProto], ErrorDiag>;
+  let Documentation = [ArmSmeStreamingDocs];
+}
+
+def ArmLocallyStreaming : InheritableAttr, TargetSpecificAttr<TargetAArch64SME> {
+  let Spellings = [GNU<"arm_locally_streaming">];
+  let Subjects = SubjectList<[Function], ErrorDiag>;
+  let Documentation = [ArmSmeLocallyStreamingDocs];
+}
+
+def ArmSharedZA : TypeAttr, TargetSpecificAttr<TargetAArch64SME> {
+  let Spellings = [GNU<"arm_shared_za">];
+  let Subjects = SubjectList<[HasFunctionProto], ErrorDiag>;
+  let Documentation = [ArmSmeSharedZADocs];
+}
+
+def ArmPreservesZA : TypeAttr, TargetSpecificAttr<TargetAArch64SME> {
+  let Spellings = [GNU<"arm_preserves_za">];
+  let Subjects = SubjectList<[HasFunctionProto], ErrorDiag>;
+  let Documentation = [ArmSmePreservesZADocs];
+}
+
+def ArmNewZA : TypeAttr, TargetSpecificAttr<TargetAArch64SME> {
+  let Spellings = [GNU<"arm_new_za">];
+  let Subjects = SubjectList<[HasFunctionProto], ErrorDiag>;
+  let Documentation = [ArmSmeNewZADocs];
+}
+
 def AArch64SVEPcs: DeclOrTypeAttr {
   let Spellings = [Clang<"aarch64_sve_pcs">];
   let Documentation = [AArch64SVEPcsDocs];
diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td
--- a/clang/include/clang/Basic/AttrDocs.td
+++ b/clang/include/clang/Basic/AttrDocs.td
@@ -6314,6 +6314,88 @@
   }];
 }
 
+def ArmSmeStreamingDocs : Documentation {
+  let Category = DocCatType;
+  let Content = [{
+The ``arm_streaming`` attribute is defined by the Arm C Language Extensions
+(ACLE) for SME. It is used to mark a function as being a streaming function for
+which ``PSTATE.SM`` must be ``1`` on entry to and on exit from the function.
+
+By adding this attribute, Clang will insert the appropriate ``smstart`` and
+``smstop`` instructions before and after calls to the function to guarantee
+that these conditions are satisfied.
+  }];
+}
+
+def ArmSmeLocallyStreamingDocs : Documentation {
+  let Category = DocCatFunction;
+  let Content = [{
+The ``arm_locally_streaming`` attribute is defined by the Arm C Language
+Extensions (ACLE) for SME. It is used to mark a function's body (not its
+interface) as requiring ``PSTATE.SM`` to be ``1``, although the function is
+expected to be called with ``PSTATE.SM=0`` and to return with ``PSTATE.SM``
+unchanged.
+
+By adding this attribute, Clang will insert the appropriate ``smstart`` and
+``smstop`` instructions in the prologue and epilogue of the function.
+  }];
+}
+
+def ArmSmeStreamingCompatibleDocs : Documentation {
+  let Category = DocCatType;
+  let Content = [{
+The ``arm_streaming_compatible`` attribute is defined by the Arm C Language
+Extensions (ACLE) for SME. It is used to mark a function as being a
+streaming-compatible function for which ``PSTATE.SM`` can be either ``0`` or
+``1`` at runtime. Additionally, the ABI specifies that the value of
+``PSTATE.SM`` is passed in register ``X0``.
+
+By adding this attribute, Clang will pass an implicit parameter with the value
+of ``PSTATE.SM`` in ``X0`` to streaming-compatible functions and will insert
+the appropriate ``smstart`` and ``smstop`` instructions around calls to other
+functions that are not streaming compatible.
+
+Clang will also avoid generating instructions that are illegal in either
+streaming mode or normal mode.
+  }];
+}
+
+def ArmSmeSharedZADocs : Documentation {
+  let Category = DocCatType;
+  let Content = [{
+The ``arm_shared_za`` attribute is defined by the Arm C Language Extensions
+(ACLE) for SME. It is used to mark a function as sharing the state of ZA, the
+accumulator array, with that of its callers.
+
+By adding this attribute, callers of this function will know that the contents
+of ZA may be used for passing or returning data, and can be modified. Clang may
+assume that ``PSTATE.ZA`` is ``1`` and will avoid setting up a lazy-save
+mechanism for calls to functions marked as ``arm_shared_za``.
+  }];
+}
+
+def ArmSmeNewZADocs : Documentation {
+  let Category = DocCatType;
+  let Content = [{
+The ``arm_new_za`` attribute is defined by the Arm C Language Extensions (ACLE)
+for SME. It is used to mark a function as a private-ZA function that requires a
+new state for ZA.
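+
+For example, a function that only uses ZA internally might be declared as
+follows (a sketch; the function name is hypothetical):
+
+.. code-block:: c++
+
+  void compute_tile(const float *in) __attribute__((arm_new_za));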
+
+By adding this attribute, Clang emits the appropriate ``smstart`` instruction
+to allow the use of ZA and will additionally commit a lazy-save if the state
+of ZA is dormant. It also emits the appropriate ``smstop`` instruction in the
+function's epilogue.
+  }];
+}
+
+def ArmSmePreservesZADocs : Documentation {
+  let Category = DocCatType;
+  let Content = [{
+The ``arm_preserves_za`` attribute is defined by the Arm C Language Extensions
+(ACLE) for SME. If a function is marked as ``arm_preserves_za``, it is a hint
+to the compiler that the function and any of its callees will preserve the
+state of ZA.
+  }];
+}
+
 def ArmMveStrictPolymorphismDocs : Documentation {
   let Category = DocCatType;
   let Content = [{
diff --git a/clang/include/clang/Basic/Builtins.def b/clang/include/clang/Basic/Builtins.def
--- a/clang/include/clang/Basic/Builtins.def
+++ b/clang/include/clang/Basic/Builtins.def
@@ -39,6 +39,7 @@
 //  A -> "reference" to __builtin_va_list
 //  V -> Vector, followed by the number of elements and the base type.
 //  q -> Scalable vector, followed by the number of elements and the base type.
+//  Q -> AArch64 svcount_t builtin type.
 //  E -> ext_vector, followed by the number of elements and the base type.
 //  X -> _Complex, followed by the base type.
 //  Y -> ptrdiff_t
diff --git a/clang/include/clang/Basic/BuiltinsSME.def b/clang/include/clang/Basic/BuiltinsSME.def
new file mode 100644
--- /dev/null
+++ b/clang/include/clang/Basic/BuiltinsSME.def
@@ -0,0 +1,20 @@
+//===--- BuiltinsSME.def - SME Builtin function database --------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the SME-specific builtin function database. Users of
+// this file must define the BUILTIN macro to make use of this information.
+//
+//===----------------------------------------------------------------------===//
+
+// The format of this database matches clang/Basic/Builtins.def.
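+//
+// For example, once arm_sme_builtins.inc has been generated from arm_sme.td,
+// it is expected to contain entries of the form (illustrative only; the
+// actual names, prototype strings and attributes come from the emitter):
+//   BUILTIN(__builtin_sme_svzero_mask_za, ...)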
+
+#define GET_SME_BUILTINS
+#include "clang/Basic/arm_sme_builtins.inc"
+#undef GET_SME_BUILTINS
+
+#undef BUILTIN
diff --git a/clang/include/clang/Basic/CMakeLists.txt b/clang/include/clang/Basic/CMakeLists.txt
--- a/clang/include/clang/Basic/CMakeLists.txt
+++ b/clang/include/clang/Basic/CMakeLists.txt
@@ -72,6 +72,24 @@
 clang_tablegen(arm_sve_sema_rangechecks.inc -gen-arm-sve-sema-rangechecks
   SOURCE arm_sve.td
   TARGET ClangARMSveSemaRangeChecks)
+clang_tablegen(arm_sve_streaming_attrs.inc -gen-arm-sve-streaming-attrs
+  SOURCE arm_sve.td
+  TARGET ClangARMSveStreamingAttrs)
+clang_tablegen(arm_sme_builtins.inc -gen-arm-sme-builtins
+  SOURCE arm_sme.td
+  TARGET ClangARMSmeBuiltins)
+clang_tablegen(arm_sme_builtin_cg.inc -gen-arm-sme-builtin-codegen
+  SOURCE arm_sme.td
+  TARGET ClangARMSmeBuiltinCG)
+clang_tablegen(arm_sme_typeflags.inc -gen-arm-sme-typeflags
+  SOURCE arm_sme.td
+  TARGET ClangARMSmeTypeFlags)
+clang_tablegen(arm_sme_sema_rangechecks.inc -gen-arm-sme-sema-rangechecks
+  SOURCE arm_sme.td
+  TARGET ClangARMSmeSemaRangeChecks)
+clang_tablegen(arm_sme_streaming_attrs.inc -gen-arm-sme-streaming-attrs
+  SOURCE arm_sme.td
+  TARGET ClangARMSmeStreamingAttrs)
 clang_tablegen(arm_cde_builtins.inc -gen-arm-cde-builtin-def
   SOURCE arm_cde.td
   TARGET ClangARMCdeBuiltinsDef)
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -2003,6 +2003,10 @@
   "than the function it overrides}1,2">;
 def note_overridden_virtual_function : Note<
   "overridden virtual function is here">;
+def err_conflicting_overriding_attributes : Error<
+  "virtual function %0 has different attributes "
+  "%diff{($) than the function it overrides (which has $)|"
+  "than the function it overrides}1,2">;
 def err_conflicting_overriding_cc_attributes : Error<
   "virtual function %0 has different calling convention attributes "
   "%diff{($) than the function it overrides (which has calling convention $)|"
@@ -3039,6 +3043,9 @@
 def err_attribute_arm_feature_sve_bits_unsupported : Error<
   "%0 is only supported when '-msve-vector-bits=' is specified with a "
   "value of 128, 256, 512, 1024 or 2048.">;
+def warn_attribute_arm_sm_incompat_builtin : Warning<
+  "builtin call has undefined behaviour when called from a %0 function">,
+  InGroup<DiagGroup<"undefined-arm-streaming">>;
 def err_attribute_requires_positive_integer : Error<
   "%0 attribute requires a %select{positive|non-negative}1 "
   "integral compile time constant expression">;
@@ -3546,6 +3553,9 @@
   "the vecreturn attribute can only be used on a class or structure with one member, which must be a vector">;
 def err_attribute_vecreturn_only_pod_record : Error<
   "the vecreturn attribute can only be used on a POD (plain old data) class or structure (i.e. no virtual functions)">;
+def err_sme_attr_mismatch : Error<
+  "function declared '%0' was previously declared '%1'"
+  " with different SME function attributes">;
 def err_cconv_change : Error<
   "function declared '%0' here was previously declared "
   "%select{'%2'|without calling convention}1">;
diff --git a/clang/include/clang/Basic/TargetBuiltins.h b/clang/include/clang/Basic/TargetBuiltins.h
--- a/clang/include/clang/Basic/TargetBuiltins.h
+++ b/clang/include/clang/Basic/TargetBuiltins.h
@@ -53,6 +53,15 @@
   };
   }
 
+  namespace SME {
+  enum {
+    LastSVEBuiltin = SVE::FirstTSBuiltin - 1,
+#define BUILTIN(ID, TYPE, ATTRS) BI##ID,
+#include "clang/Basic/BuiltinsSME.def"
+    FirstTSBuiltin,
+  };
+  } // namespace SME
+
   /// AArch64 builtins
   namespace AArch64 {
   enum {
@@ -60,8 +69,10 @@
     LastNEONBuiltin = NEON::FirstTSBuiltin - 1,
     FirstSVEBuiltin = NEON::FirstTSBuiltin,
     LastSVEBuiltin = SVE::FirstTSBuiltin - 1,
-  #define BUILTIN(ID, TYPE, ATTRS) BI##ID,
-  #include "clang/Basic/BuiltinsAArch64.def"
+    FirstSMEBuiltin = SVE::FirstTSBuiltin,
+    LastSMEBuiltin = SME::FirstTSBuiltin - 1,
+#define BUILTIN(ID, TYPE, ATTRS) BI##ID,
+#include "clang/Basic/BuiltinsAArch64.def"
     LastTSBuiltin
   };
   }
@@ -282,6 +293,12 @@
     bool isTupleCreate() const { return Flags & IsTupleCreate; }
     bool isTupleGet() const { return Flags & IsTupleGet; }
     bool isTupleSet() const { return Flags & IsTupleSet; }
+    bool isReadZA() const { return Flags & IsReadZA; }
+    bool isWriteZA() const { return Flags & IsWriteZA; }
+    bool isArmInStreamingMode() const { return Flags & IsArmInStreamingMode; }
+    bool isZASliceBaseOffsetIntr() const {
+      return Flags & IsZASliceBaseOffsetIntr;
+    }
 
     uint64_t getBits() const { return Flags; }
     bool isFlagSet(uint64_t Flag) const { return Flags & Flag; }
diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td
new file mode 100644
--- /dev/null
+++ b/clang/include/clang/Basic/arm_sme.td
@@ -0,0 +1,330 @@
+//===--- arm_sme.td - ARM SME compiler interface ------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the TableGen definitions from which the ARM SME header
+// file will be generated. See:
+//
+//   https://developer.arm.com/architectures/system-architectures/software-standards/acle
+//
+//===----------------------------------------------------------------------===//
+
+include "arm_sve_common.td"
+
+class Inst<string n, string p, string t, string i, list<FlagType> ft,
+           list<ImmCheck> ch, MemEltType met = MemEltTyDefault> {
+  string Name = n;
+  string Prototype = p;
+  string Types = t;
+  string ArchGuard = "";
+  int Merge = MergeNone.Value;
+  string MergeSuffix = MergeNone.Suffix;
+  string LLVMIntrinsic = i;
+  list<FlagType> Flags = ft;
+  list<ImmCheck> ImmChecks = ch;
+  int MemEltType = met.Value;
+}
+
+// MBaseInst: Memory base instruction.
+class MBaseInst<string n, string p, string i, list<FlagType> f,
+                MemEltType met, list<ImmCheck> ch>
+    : Inst<n, p, "", i, f, ch, met> {
+}
+
+// MLInst: Memory load instruction.
+class MLInst<string n, string p, MemEltType met, string i, list<ImmCheck> ch>
+    : MBaseInst<n, p, i, [IsLoad, IsOverloadNone, IsStreaming, IsSharedZA],
+                met, ch> {
+}
+
+// MSInst: Memory store instruction.
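+// For example, the SVST1_ZA8 definitions below instantiate MSInst so that the
+// emitter produces the svst1_hor_za8 and svst1_ver_za8 builtins (a sketch of
+// the mapping; the exact builtin names come from the TableGen emitter).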
+class MSInst<string n, string p, MemEltType met, string i, list<ImmCheck> ch>
+    : MBaseInst<n, p, i, [IsStore, IsOverloadNone, IsStreaming, IsSharedZA],
+                met, ch> {
+}
+
+multiclass LoadHV<string n, string p, MemEltType met, string i,
+                  list<ImmCheck> ch> {
+  def NAME # _H : MLInst<"svld1_hor_" # n, p, met, i # "_horiz", ch>;
+  def NAME # _V : MLInst<"svld1_ver_" # n, p, met, i # "_vert", ch>;
+}
+
+multiclass StoreHV<string n, string p, MemEltType met, string i,
+                   list<ImmCheck> ch> {
+  def NAME # _H : MSInst<"svst1_hor_" # n, p, met, i # "_horiz", ch>;
+  def NAME # _V : MSInst<"svst1_ver_" # n, p, met, i # "_vert", ch>;
+}
+
+class ReadInst<string n, string p, string t, string i, list<ImmCheck> ch>
+    : Inst<n, p, t, i, [IsReadZA, IsStreaming, IsSharedZA], ch> {
+}
+
+class WriteInst<string n, string p, string t, string i, list<ImmCheck> ch>
+    : Inst<n, p, t, i, [IsWriteZA, IsStreaming, IsSharedZA], ch> {
+}
+
+multiclass ReadHV<string n, string p, string t, string i, list<ImmCheck> ch> {
+  def NAME # _H : ReadInst<"svread_hor_" # n, p, t, i # "_horiz", ch>;
+  def NAME # _V : ReadInst<"svread_ver_" # n, p, t, i # "_vert", ch>;
+}
+
+multiclass WriteHV<string n, string p, string t, string i, list<ImmCheck> ch> {
+  def NAME # _H : WriteInst<"svwrite_hor_" # n, p, t, i # "_horiz", ch>;
+  def NAME # _V : WriteInst<"svwrite_ver_" # n, p, t, i # "_vert", ch>;
+}
+
+multiclass AddHV<string n, string p, string t, string i, list<ImmCheck> ch> {
+  def NAME # _H : Inst<"svaddha_" # n, p, t, i # "ha", [IsStreaming, IsSharedZA], ch>;
+  def NAME # _V : Inst<"svaddva_" # n, p, t, i # "va", [IsStreaming, IsSharedZA], ch>;
+}
+
+multiclass MopAS<string n, string t, string i, string w, list<ImmCheck> ch> {
+  def NAME # _A : Inst<"svmopa_" # n, "viPPdd", t, "aarch64_sme_" # i # "a" # w, [IsStreaming, IsSharedZA], ch>;
+  def NAME # _S : Inst<"svmops_" # n, "viPPdd", t, "aarch64_sme_" # i # "s" # w, [IsStreaming, IsSharedZA], ch>;
+}
+
+multiclass MixedSignMopAS<string p, string n, string t, string i,
+                          list<ImmCheck> ch> {
+  def NAME # _A : Inst;
+  def NAME # _S : Inst;
+}
+
+// == LOADS ==
+
+defm SVLD1_ZA8 : LoadHV<"za8", "vimiPQ", MemEltTyInt8, "aarch64_sme_ld1b", [ImmCheck<0, ImmCheck0>, ImmCheck<2, ImmCheck0_15>]>;
+defm SVLD1_ZA16 : LoadHV<"za16", "vimiPQ", MemEltTyInt16, "aarch64_sme_ld1h", [ImmCheck<0, ImmCheck0_1>, ImmCheck<2, ImmCheck0_7>]>;
+defm SVLD1_ZA32 : LoadHV<"za32", "vimiPQ", MemEltTyInt32, "aarch64_sme_ld1w", [ImmCheck<0, ImmCheck0_3>, ImmCheck<2, ImmCheck0_3>]>;
+defm SVLD1_ZA64 : LoadHV<"za64", "vimiPQ", MemEltTyInt64, "aarch64_sme_ld1d", [ImmCheck<0, ImmCheck0_7>, ImmCheck<2, ImmCheck0_1>]>;
+defm SVLD1_ZA128 : LoadHV<"za128", "vimiPQ", MemEltTyInt128, "aarch64_sme_ld1q", [ImmCheck<0, ImmCheck0_15>, ImmCheck<2, ImmCheck0>]>;
+
+defm SVLD1_VNUM_ZA8 : LoadHV<"vnum_za8", "vimiPQl", MemEltTyInt8, "aarch64_sme_ld1b", [ImmCheck<0, ImmCheck0>, ImmCheck<2, ImmCheck0_15>]>;
+defm SVLD1_VNUM_ZA16 : LoadHV<"vnum_za16", "vimiPQl", MemEltTyInt16, "aarch64_sme_ld1h", [ImmCheck<0, ImmCheck0_1>, ImmCheck<2, ImmCheck0_7>]>;
+defm SVLD1_VNUM_ZA32 : LoadHV<"vnum_za32", "vimiPQl", MemEltTyInt32, "aarch64_sme_ld1w", [ImmCheck<0, ImmCheck0_3>, ImmCheck<2, ImmCheck0_3>]>;
+defm SVLD1_VNUM_ZA64 : LoadHV<"vnum_za64", "vimiPQl", MemEltTyInt64, "aarch64_sme_ld1d", [ImmCheck<0, ImmCheck0_7>, ImmCheck<2, ImmCheck0_1>]>;
+defm SVLD1_VNUM_ZA128 : LoadHV<"vnum_za128", "vimiPQl", MemEltTyInt128, "aarch64_sme_ld1q", [ImmCheck<0, ImmCheck0_15>, ImmCheck<2, ImmCheck0>]>;
+
+def SVLDR_VNUM_ZA : MLInst<"svldr_vnum_za", "vmiPQ", MemEltTyInt8, "aarch64_sme_ldr", [ImmCheck<1, ImmCheck0_15>]>;
+
+// == STORES ==
+
+defm SVST1_ZA8 : StoreHV<"za8", "vimiP{", MemEltTyInt8, "aarch64_sme_st1b", [ImmCheck<0, ImmCheck0>, ImmCheck<2, ImmCheck0_15>]>;
+defm SVST1_ZA16 : StoreHV<"za16", "vimiP{", MemEltTyInt16, "aarch64_sme_st1h", [ImmCheck<0, ImmCheck0_1>, ImmCheck<2, ImmCheck0_7>]>;
+defm SVST1_ZA32 : StoreHV<"za32", "vimiP{", MemEltTyInt32, "aarch64_sme_st1w", [ImmCheck<0, ImmCheck0_3>, ImmCheck<2, ImmCheck0_3>]>;
+defm SVST1_ZA64 : StoreHV<"za64", "vimiP{", MemEltTyInt64, "aarch64_sme_st1d", [ImmCheck<0, ImmCheck0_7>, ImmCheck<2, ImmCheck0_1>]>;
+defm SVST1_ZA128 : StoreHV<"za128", "vimiP{", MemEltTyInt128, "aarch64_sme_st1q", [ImmCheck<0, ImmCheck0_15>, ImmCheck<2, ImmCheck0>]>;
+
+defm SVST1_VNUM_ZA8 : StoreHV<"vnum_za8", "vimiP{l", MemEltTyInt8, "aarch64_sme_st1b", [ImmCheck<0, ImmCheck0>, ImmCheck<2, ImmCheck0_15>]>;
+defm SVST1_VNUM_ZA16 : StoreHV<"vnum_za16", "vimiP{l", MemEltTyInt16, "aarch64_sme_st1h", [ImmCheck<0, ImmCheck0_1>, ImmCheck<2, ImmCheck0_7>]>;
+defm SVST1_VNUM_ZA32 : StoreHV<"vnum_za32", "vimiP{l", MemEltTyInt32, "aarch64_sme_st1w", [ImmCheck<0, ImmCheck0_3>, ImmCheck<2, ImmCheck0_3>]>;
+defm SVST1_VNUM_ZA64 : StoreHV<"vnum_za64", "vimiP{l", MemEltTyInt64, "aarch64_sme_st1d", [ImmCheck<0, ImmCheck0_7>, ImmCheck<2, ImmCheck0_1>]>;
+defm SVST1_VNUM_ZA128 : StoreHV<"vnum_za128", "vimiP{l", MemEltTyInt128, "aarch64_sme_st1q", [ImmCheck<0, ImmCheck0_15>, ImmCheck<2, ImmCheck0>]>;
+
+def SVSTR_VNUM_ZA : MSInst<"svstr_vnum_za", "vmiP{", MemEltTyInt8, "aarch64_sme_str", [ImmCheck<1, ImmCheck0_15>]>;
+
+// == MOVA/READS ==
+
+defm SVREAD_ZA8 : ReadHV<"za8_{d}_m", "ddPimi", "cUc", "aarch64_sme_read", [ImmCheck<2, ImmCheck0>, ImmCheck<4, ImmCheck0_15>]>;
+defm SVREAD_ZA16 : ReadHV<"za16_{d}_m", "ddPimi", "sUshb", "aarch64_sme_read", [ImmCheck<2, ImmCheck0_1>, ImmCheck<4, ImmCheck0_7>]>;
+defm SVREAD_ZA32 : ReadHV<"za32_{d}_m", "ddPimi", "iUif", "aarch64_sme_read", [ImmCheck<2, ImmCheck0_3>, ImmCheck<4, ImmCheck0_3>]>;
+defm SVREAD_ZA64 : ReadHV<"za64_{d}_m", "ddPimi", "lUld", "aarch64_sme_read", [ImmCheck<2, ImmCheck0_7>, ImmCheck<4, ImmCheck0_1>]>;
+defm SVREAD_ZA128 : ReadHV<"za128_{d}_m", "ddPimi", "cUcsUshbiUiflUld", "aarch64_sme_readq", [ImmCheck<2, ImmCheck0_15>, ImmCheck<4, ImmCheck0>]>;
+
+// == MOVA/WRITES ==
+
+defm SVWRITE_ZA8 : WriteHV<"za8_{d}_m", "vimiPd", "cUc", "aarch64_sme_write", [ImmCheck<0, ImmCheck0>, ImmCheck<2, ImmCheck0_15>]>;
+defm SVWRITE_ZA16 : WriteHV<"za16_{d}_m", "vimiPd", "sUshb", "aarch64_sme_write", [ImmCheck<0, ImmCheck0_1>, ImmCheck<2, ImmCheck0_7>]>;
+defm SVWRITE_ZA32 : WriteHV<"za32_{d}_m", "vimiPd", "iUif", "aarch64_sme_write", [ImmCheck<0, ImmCheck0_3>, ImmCheck<2, ImmCheck0_3>]>;
+defm SVWRITE_ZA64 : WriteHV<"za64_{d}_m", "vimiPd", "lUld", "aarch64_sme_write", [ImmCheck<0, ImmCheck0_7>, ImmCheck<2, ImmCheck0_1>]>;
+defm SVWRITE_ZA128 : WriteHV<"za128_{d}_m", "vimiPd", "cUcsUshbiUiflUld", "aarch64_sme_writeq", [ImmCheck<0, ImmCheck0_15>, ImmCheck<2, ImmCheck0>]>;
+
+// == SVCNT ==
+
+def SVCNTSB : Inst<"svcntsb", "nv", "", "aarch64_sme_cntsb", [IsOverloadNone, IsStreamingCompatible, IsPreservedZA], []>;
+def SVCNTSH : Inst<"svcntsh", "nv", "", "aarch64_sme_cntsh", [IsOverloadNone, IsStreamingCompatible, IsPreservedZA], []>;
+def SVCNTSW : Inst<"svcntsw", "nv", "", "aarch64_sme_cntsw", [IsOverloadNone, IsStreamingCompatible, IsPreservedZA], []>;
+def SVCNTSD : Inst<"svcntsd", "nv", "", "aarch64_sme_cntsd", [IsOverloadNone, IsStreamingCompatible, IsPreservedZA], []>;
+
+// == ZERO ==
+
+def SVZERO_MASK : Inst<"svzero_mask_za", "vi", "", "aarch64_sme_zero", [IsOverloadNone, IsStreaming, IsSharedZA], [ImmCheck<0, ImmCheck0_255>]>;
+
+// == PSTATE FUNCTIONS ==
+def IN_STREAMING_MODE : Inst<"__arm_in_streaming_mode", "yv", "", "", [IsOverloadNone, IsStreamingCompatible, IsArmInStreamingMode], []>;
+
+// == ADDHA/ADDVA ==
+
+defm SVADD_ZA32 : AddHV<"za32_{d}", "viPPd", "iUi", "aarch64_sme_add", [ImmCheck<0, ImmCheck0_3>]>;
+let ArchGuard = "defined(__ARM_FEATURE_SME_I64I64)" in {
+  defm SVADD_ZA64 : AddHV<"za64_{d}", "viPPd", "lUl", "aarch64_sme_add", [ImmCheck<0, ImmCheck0_7>]>;
+}
+
+// == MOPA / MOPS ==
+
+defm SVMOP_WIDE_ZA32_FP : MopAS<"za32_{d}", "hb", "mop", "_wide", [ImmCheck<0, ImmCheck0_3>]>;
+defm SVMOP_WIDE_ZA32_S8 : MopAS<"za32_{d}", "c", "smop", "_wide", [ImmCheck<0, ImmCheck0_3>]>;
+defm SVMOP_WIDE_ZA32_U8 : MopAS<"za32_{d}", "Uc", "umop", "_wide", [ImmCheck<0, ImmCheck0_3>]>;
+defm SVMOP_ZA32_FP : MopAS<"za32_{d}", "f", "mop", "", [ImmCheck<0, ImmCheck0_3>]>;
+
+let ArchGuard = "defined(__ARM_FEATURE_SME_I64I64)" in {
+  defm SVMOPA_WIDE_ZA64_S16 : MopAS<"za64_{d}", "s", "smop", "_wide", [ImmCheck<0, ImmCheck0_7>]>;
+  defm SVMOPA_WIDE_ZA64_U16 : MopAS<"za64_{d}", "Us", "umop", "_wide", [ImmCheck<0, ImmCheck0_7>]>;
+}
+
+let ArchGuard = "defined(__ARM_FEATURE_SME_F64F64)" in {
+  defm SVMOP_ZA64_FP : MopAS<"za64_{d}", "d", "mop", "", [ImmCheck<0, ImmCheck0_7>]>;
+}
+
+// == SUMOPA / SUMOPS / USMOPS / USMOPA ==
+
+defm SVSUMOP_ZA32_S8 : MixedSignMopAS<"svsumop", "za32_{d}", "c", "aarch64_sme_sumop", [ImmCheck<0, ImmCheck0_3>]>;
+let ArchGuard = "defined(__ARM_FEATURE_SME_I64I64)" in {
+  defm SVSUMOP_ZA64_S16 : MixedSignMopAS<"svsumop", "za64_{d}", "s", "aarch64_sme_sumop", [ImmCheck<0, ImmCheck0_7>]>;
+}
+
+defm SVUSMOP_ZA32_U8 : MixedSignMopAS<"svusmop", "za32_{d}", "Uc", "aarch64_sme_usmop", [ImmCheck<0, ImmCheck0_3>]>;
+let ArchGuard = "defined(__ARM_FEATURE_SME_I64I64)" in {
+  defm SVUSMOP_ZA64_U16 : MixedSignMopAS<"svusmop", "za64_{d}", "Us", "aarch64_sme_usmop", [ImmCheck<0, ImmCheck0_7>]>;
}
+
+// FMLA/FMLS
+let ArchGuard = "defined(__ARM_FEATURE_SME2)" in {
+  def SVMLA_MULTI_VG1x2_F32 : Inst<"svmla_za32[_{d}]_vg1x2", "vmi22", "f", "aarch64_sme_fmla_multi_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>;
+  def SVMLA_MULTI_VG1x4_F32 : Inst<"svmla_za32[_{d}]_vg1x4", "vmi44", "f", "aarch64_sme_fmla_multi_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>;
+  def SVMLS_MULTI_VG1x2_F32 : Inst<"svmls_za32[_{d}]_vg1x2", "vmi22", "f", "aarch64_sme_fmls_multi_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>;
+  def SVMLS_MULTI_VG1x4_F32 : Inst<"svmls_za32[_{d}]_vg1x4", "vmi44", "f", "aarch64_sme_fmls_multi_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>;
+
+  def SVMLA_SINGLE_VG1x2_F32 : Inst<"svmla[_single]_za32[_{d}]_vg1x2", "vmi2d", "f", "aarch64_sme_fmla_single_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>;
+  def SVMLA_SINGLE_VG1x4_F32 : Inst<"svmla[_single]_za32[_{d}]_vg1x4", "vmi4d", "f", "aarch64_sme_fmla_single_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>;
+  def SVMLS_SINGLE_VG1x2_F32 : Inst<"svmls[_single]_za32[_{d}]_vg1x2", "vmi2d", "f", "aarch64_sme_fmls_single_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>;
+  def SVMLS_SINGLE_VG1x4_F32 : Inst<"svmls[_single]_za32[_{d}]_vg1x4", "vmi4d", "f", "aarch64_sme_fmls_single_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>;
+
+  def SVMLA_LANE_VG1x2_F32 : Inst<"svmla_lane_za32[_{d}]_vg1x2", "vmi2di", "f", "aarch64_sme_fmla_lane_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>, ImmCheck<4, ImmCheck0_3>]>;
+  def SVMLA_LANE_VG1x4_F32 : Inst<"svmla_lane_za32[_{d}]_vg1x4", "vmi4di", "f", "aarch64_sme_fmla_lane_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>, ImmCheck<4, ImmCheck0_3>]>;
+  def SVMLS_LANE_VG1x2_F32 : Inst<"svmls_lane_za32[_{d}]_vg1x2", "vmi2di", "f", "aarch64_sme_fmls_lane_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>, ImmCheck<4, ImmCheck0_3>]>;
+  def SVMLS_LANE_VG1x4_F32 : Inst<"svmls_lane_za32[_{d}]_vg1x4", "vmi4di", "f", "aarch64_sme_fmls_lane_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>, ImmCheck<4, ImmCheck0_3>]>;
+}
+
+let ArchGuard = "defined(__ARM_FEATURE_SME2) && defined(__ARM_FEATURE_SME_F64F64)" in {
+  def SVMLA_MULTI_VG1x2_F64 : Inst<"svmla_za64[_{d}]_vg1x2", "vmi22", "d", "aarch64_sme_fmla_multi_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>;
+  def SVMLA_MULTI_VG1x4_F64 : Inst<"svmla_za64[_{d}]_vg1x4", "vmi44", "d", "aarch64_sme_fmla_multi_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>;
+  def SVMLS_MULTI_VG1x2_F64 : Inst<"svmls_za64[_{d}]_vg1x2", "vmi22", "d", "aarch64_sme_fmls_multi_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>;
+  def SVMLS_MULTI_VG1x4_F64 : Inst<"svmls_za64[_{d}]_vg1x4", "vmi44", "d", "aarch64_sme_fmls_multi_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>;
+
+  def SVMLA_SINGLE_VG1x2_F64 : Inst<"svmla[_single]_za64[_{d}]_vg1x2", "vmi2d", "d", "aarch64_sme_fmla_single_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>;
+  def SVMLA_SINGLE_VG1x4_F64 : Inst<"svmla[_single]_za64[_{d}]_vg1x4", "vmi4d", "d", "aarch64_sme_fmla_single_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>;
+  def SVMLS_SINGLE_VG1x2_F64 : Inst<"svmls[_single]_za64[_{d}]_vg1x2", "vmi2d", "d", "aarch64_sme_fmls_single_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>;
+  def SVMLS_SINGLE_VG1x4_F64 : Inst<"svmls[_single]_za64[_{d}]_vg1x4", "vmi4d", "d", "aarch64_sme_fmls_single_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>;
+
+  def SVMLA_LANE_VG1x2_F64 : Inst<"svmla_lane_za64[_{d}]_vg1x2", "vmi2di", "d", "aarch64_sme_fmla_lane_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>, ImmCheck<4, ImmCheck0_1>]>;
+  def SVMLA_LANE_VG1x4_F64 : Inst<"svmla_lane_za64[_{d}]_vg1x4", "vmi4di", "d", "aarch64_sme_fmla_lane_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>, ImmCheck<4, ImmCheck0_1>]>;
+  def SVMLS_LANE_VG1x2_F64 : Inst<"svmls_lane_za64[_{d}]_vg1x2", "vmi2di", "d", "aarch64_sme_fmls_lane_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>, ImmCheck<4, ImmCheck0_1>]>;
+  def SVMLS_LANE_VG1x4_F64 : Inst<"svmls_lane_za64[_{d}]_vg1x4", "vmi4di", "d", "aarch64_sme_fmls_lane_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>, ImmCheck<4, ImmCheck0_1>]>;
+}
+
+// FMLAL/FMLSL/UMLAL/SMLAL
+let ArchGuard = "defined(__ARM_FEATURE_SME2)" in {
+  // MULTI MLAL
+  def SVMLAL_MULTI_VG2x2_F16 : Inst<"svmlal_za32[_{d}]_vg2x2", "vmi22", "bh", "aarch64_sme_fmlal_multi_vg2x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>]>;
+  def SVMLAL_MULTI_VG2x4_F16 : Inst<"svmlal_za32[_{d}]_vg2x4", "vmi44", "bh", "aarch64_sme_fmlal_multi_vg2x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>]>;
+  def SVMLAL_MULTI_VG2x2_S16 : Inst<"svmlal_za32[_{d}]_vg2x2", "vmi22", "s", "aarch64_sme_smlal_multi_vg2x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>]>;
+  def SVMLAL_MULTI_VG2x4_S16 : Inst<"svmlal_za32[_{d}]_vg2x4", "vmi44", "s", "aarch64_sme_smlal_multi_vg2x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>]>;
+  def SVMLAL_MULTI_VG2x2_U16 : Inst<"svmlal_za32[_{d}]_vg2x2", "vmi22", "Us", "aarch64_sme_umlal_multi_vg2x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>]>;
+  def SVMLAL_MULTI_VG2x4_U16 : Inst<"svmlal_za32[_{d}]_vg2x4", "vmi44", "Us", "aarch64_sme_umlal_multi_vg2x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>]>;
+
+  // MULTI MLSL
+  def SVMLSL_MULTI_VG2x2_F16 : Inst<"svmlsl_za32[_{d}]_vg2x2", "vmi22", "bh", "aarch64_sme_fmlsl_multi_vg2x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>]>;
+  def SVMLSL_MULTI_VG2x4_F16 : Inst<"svmlsl_za32[_{d}]_vg2x4", "vmi44", "bh", "aarch64_sme_fmlsl_multi_vg2x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>]>;
+  def SVMLSL_MULTI_VG2x2_S16 : Inst<"svmlsl_za32[_{d}]_vg2x2", "vmi22", "s", "aarch64_sme_smlsl_multi_vg2x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>]>;
+  def SVMLSL_MULTI_VG2x4_S16 : Inst<"svmlsl_za32[_{d}]_vg2x4", "vmi44", "s", "aarch64_sme_smlsl_multi_vg2x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>]>;
+  def SVMLSL_MULTI_VG2x2_U16 : Inst<"svmlsl_za32[_{d}]_vg2x2", "vmi22", "Us", "aarch64_sme_umlsl_multi_vg2x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>]>;
+  def SVMLSL_MULTI_VG2x4_U16 : Inst<"svmlsl_za32[_{d}]_vg2x4", "vmi44", "Us", "aarch64_sme_umlsl_multi_vg2x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>]>;
+
+  // SINGLE MLAL
+  def SVMLAL_SINGLE_VG2x1_F16 : Inst<"svmlal[_single]_za32[_{d}]_vg2x1", "vmidd", "bh", "aarch64_sme_fmlal_single_vg2x1", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_14_Mul2>]>;
+  def SVMLAL_SINGLE_VG2x2_F16 : Inst<"svmlal[_single]_za32[_{d}]_vg2x2", "vmi2d", "bh", "aarch64_sme_fmlal_single_vg2x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>]>;
+  def SVMLAL_SINGLE_VG2x4_F16 : Inst<"svmlal[_single]_za32[_{d}]_vg2x4", "vmi4d", "bh", "aarch64_sme_fmlal_single_vg2x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>]>;
+  def SVMLAL_SINGLE_VG2x1_S16 : Inst<"svmlal[_single]_za32[_{d}]_vg2x1", "vmidd", "s", "aarch64_sme_smlal_single_vg2x1", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_14_Mul2>]>;
+  def SVMLAL_SINGLE_VG2x2_S16 : Inst<"svmlal[_single]_za32[_{d}]_vg2x2", "vmi2d", "s", "aarch64_sme_smlal_single_vg2x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>]>;
+  def SVMLAL_SINGLE_VG2x4_S16 : Inst<"svmlal[_single]_za32[_{d}]_vg2x4", "vmi4d", "s", "aarch64_sme_smlal_single_vg2x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>]>;
+  def SVMLAL_SINGLE_VG2x1_U16 : Inst<"svmlal[_single]_za32[_{d}]_vg2x1", "vmidd", "Us", "aarch64_sme_umlal_single_vg2x1", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_14_Mul2>]>;
+  def SVMLAL_SINGLE_VG2x2_U16 : Inst<"svmlal[_single]_za32[_{d}]_vg2x2", "vmi2d", "Us", "aarch64_sme_umlal_single_vg2x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>]>;
+  def SVMLAL_SINGLE_VG2x4_U16 : Inst<"svmlal[_single]_za32[_{d}]_vg2x4", "vmi4d", "Us", "aarch64_sme_umlal_single_vg2x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>]>;
+
+  // SINGLE MLSL
+  def SVMLSL_SINGLE_VG2x1_F16 : Inst<"svmlsl[_single]_za32[_{d}]_vg2x1", "vmidd", "bh", "aarch64_sme_fmlsl_single_vg2x1", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_14_Mul2>]>;
+  def SVMLSL_SINGLE_VG2x2_F16 : Inst<"svmlsl[_single]_za32[_{d}]_vg2x2", "vmi2d", "bh", "aarch64_sme_fmlsl_single_vg2x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>]>;
+  def SVMLSL_SINGLE_VG2x4_F16 : Inst<"svmlsl[_single]_za32[_{d}]_vg2x4", "vmi4d", "bh", "aarch64_sme_fmlsl_single_vg2x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>]>;
+  def SVMLSL_SINGLE_VG2x1_S16 : Inst<"svmlsl[_single]_za32[_{d}]_vg2x1", "vmidd", "s", "aarch64_sme_smlsl_single_vg2x1", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_14_Mul2>]>;
+  def SVMLSL_SINGLE_VG2x2_S16 : Inst<"svmlsl[_single]_za32[_{d}]_vg2x2", "vmi2d", "s", "aarch64_sme_smlsl_single_vg2x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>]>;
+  def SVMLSL_SINGLE_VG2x4_S16 : Inst<"svmlsl[_single]_za32[_{d}]_vg2x4", "vmi4d", "s", "aarch64_sme_smlsl_single_vg2x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>]>;
+  def SVMLSL_SINGLE_VG2x1_U16 : Inst<"svmlsl[_single]_za32[_{d}]_vg2x1", "vmidd", "Us", "aarch64_sme_umlsl_single_vg2x1", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_14_Mul2>]>;
+  def SVMLSL_SINGLE_VG2x2_U16 : Inst<"svmlsl[_single]_za32[_{d}]_vg2x2", "vmi2d", "Us", "aarch64_sme_umlsl_single_vg2x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>]>;
+  def SVMLSL_SINGLE_VG2x4_U16 : Inst<"svmlsl[_single]_za32[_{d}]_vg2x4", "vmi4d", "Us", "aarch64_sme_umlsl_single_vg2x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>]>;
+
+  // INDEXED MLAL
+  def SVMLAL_LANE_VG2x1_F16 : Inst<"svmlal[_lane]_za32[_{d}]_vg2x1", "vmiddi", "bh", "aarch64_sme_fmlal_lane_vg2x1", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_14_Mul2>, ImmCheck<4, ImmCheck0_7>]>;
+  def SVMLAL_LANE_VG2x2_F16 : Inst<"svmlal[_lane]_za32[_{d}]_vg2x2", "vmi2di", "bh", "aarch64_sme_fmlal_lane_vg2x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>, ImmCheck<4, ImmCheck0_7>]>;
+  def SVMLAL_LANE_VG2x4_F16 : Inst<"svmlal[_lane]_za32[_{d}]_vg2x4", "vmi4di", "bh", "aarch64_sme_fmlal_lane_vg2x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>, ImmCheck<4, ImmCheck0_7>]>;
+  def SVMLAL_LANE_VG2x1_S16 : Inst<"svmlal[_lane]_za32[_{d}]_vg2x1", "vmiddi", "s", "aarch64_sme_smlal_lane_vg2x1", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_14_Mul2>, ImmCheck<4, ImmCheck0_7>]>;
+  def SVMLAL_LANE_VG2x2_S16 : Inst<"svmlal[_lane]_za32[_{d}]_vg2x2", "vmi2di", "s", "aarch64_sme_smlal_lane_vg2x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>, ImmCheck<4, ImmCheck0_7>]>;
+  def SVMLAL_LANE_VG2x4_S16 : Inst<"svmlal[_lane]_za32[_{d}]_vg2x4", "vmi4di", "s", "aarch64_sme_smlal_lane_vg2x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>, ImmCheck<4, ImmCheck0_7>]>;
+  def SVMLAL_LANE_VG2x1_U16 : Inst<"svmlal[_lane]_za32[_{d}]_vg2x1", "vmiddi", "Us", "aarch64_sme_umlal_lane_vg2x1", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_14_Mul2>, ImmCheck<4, ImmCheck0_7>]>;
+  def SVMLAL_LANE_VG2x2_U16 : Inst<"svmlal[_lane]_za32[_{d}]_vg2x2", "vmi2di", "Us", "aarch64_sme_umlal_lane_vg2x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>, ImmCheck<4, ImmCheck0_7>]>;
+  def SVMLAL_LANE_VG2x4_U16 : Inst<"svmlal[_lane]_za32[_{d}]_vg2x4", "vmi4di", "Us", "aarch64_sme_umlal_lane_vg2x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>, ImmCheck<4, ImmCheck0_7>]>;
+
+  // INDEXED MLSL
+  def SVMLSL_LANE_VG2x1_F16 : Inst<"svmlsl[_lane]_za32[_{d}]_vg2x1", "vmiddi", "bh", "aarch64_sme_fmlsl_lane_vg2x1", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_14_Mul2>, ImmCheck<4, ImmCheck0_7>]>;
+  def SVMLSL_LANE_VG2x2_F16 : Inst<"svmlsl[_lane]_za32[_{d}]_vg2x2", "vmi2di", "bh", "aarch64_sme_fmlsl_lane_vg2x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>, ImmCheck<4, ImmCheck0_7>]>;
+  def SVMLSL_LANE_VG2x4_F16 : Inst<"svmlsl[_lane]_za32[_{d}]_vg2x4", "vmi4di", "bh", "aarch64_sme_fmlsl_lane_vg2x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>, ImmCheck<4, ImmCheck0_7>]>;
+  def SVMLSL_LANE_VG2x1_S16 : Inst<"svmlsl[_lane]_za32[_{d}]_vg2x1", "vmiddi", "s", "aarch64_sme_smlsl_lane_vg2x1", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_14_Mul2>, ImmCheck<4, ImmCheck0_7>]>;
+  def SVMLSL_LANE_VG2x2_S16 : Inst<"svmlsl[_lane]_za32[_{d}]_vg2x2", "vmi2di", "s", "aarch64_sme_smlsl_lane_vg2x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>, ImmCheck<4, ImmCheck0_7>]>;
+  def SVMLSL_LANE_VG2x4_S16 : Inst<"svmlsl[_lane]_za32[_{d}]_vg2x4", "vmi4di", "s", "aarch64_sme_smlsl_lane_vg2x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>, ImmCheck<4, ImmCheck0_7>]>;
+  def SVMLSL_LANE_VG2x1_U16 : Inst<"svmlsl[_lane]_za32[_{d}]_vg2x1", "vmiddi", "Us", "aarch64_sme_umlsl_lane_vg2x1", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_14_Mul2>, ImmCheck<4, ImmCheck0_7>]>;
+  def SVMLSL_LANE_VG2x2_U16 : Inst<"svmlsl[_lane]_za32[_{d}]_vg2x2", "vmi2di", "Us", "aarch64_sme_umlsl_lane_vg2x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>, ImmCheck<4, ImmCheck0_7>]>;
+  def SVMLSL_LANE_VG2x4_U16 : Inst<"svmlsl[_lane]_za32[_{d}]_vg2x4", "vmi4di", "Us", "aarch64_sme_umlsl_lane_vg2x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>, ImmCheck<4, ImmCheck0_7>]>;
+}
+
+//
+// 2 and 4 vector-group read/write intrinsics.
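+// For example, given the prototype "2mi" used below ('2' is a tuple of two
+// default vectors, 'm' a uint32_t, 'i' a constant uint64_t), SVREAD_ZA8 is
+// expected to surface in C roughly as (a sketch):
+//   svuint8x2_t svread_hor_za8_u8_vg2(uint32_t slice_base, uint64_t imm);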
+//
+
+multiclass WriteHV_VG<string n, string t, string i, list<ImmCheck> checkvg2,
+                      list<ImmCheck> checkvg4> {
+  def NAME # _VG2_H : Inst<"svwrite_hor_" # n # "_vg2", "vmi2", t, i # "_hor_vg2", [IsSharedZA, IsStreaming, IsZASliceBaseOffsetIntr], checkvg2>;
+  def NAME # _VG2_V : Inst<"svwrite_ver_" # n # "_vg2", "vmi2", t, i # "_ver_vg2", [IsSharedZA, IsStreaming, IsZASliceBaseOffsetIntr], checkvg2>;
+  def NAME # _VG4_H : Inst<"svwrite_hor_" # n # "_vg4", "vmi4", t, i # "_hor_vg4", [IsSharedZA, IsStreaming, IsZASliceBaseOffsetIntr], checkvg4>;
+  def NAME # _VG4_V : Inst<"svwrite_ver_" # n # "_vg4", "vmi4", t, i # "_ver_vg4", [IsSharedZA, IsStreaming, IsZASliceBaseOffsetIntr], checkvg4>;
+}
+
+let ArchGuard = "defined(__ARM_FEATURE_SME2)" in {
+  defm SVWRITE_ZA8 : WriteHV_VG<"za8[_{d}]", "cUc", "aarch64_sme_write", [ImmCheck<1, ImmCheck0_14_Mul2>], [ImmCheck<1, ImmCheck0_12_Mul4>]>;
+  defm SVWRITE_ZA16 : WriteHV_VG<"za16[_{d}]", "sUshb", "aarch64_sme_write", [ImmCheck<1, ImmCheck0_6_Mul2>], [ImmCheck<1, ImmCheck0_4_Mul4>]>;
+  defm SVWRITE_ZA32 : WriteHV_VG<"za32[_{d}]", "iUif", "aarch64_sme_write", [ImmCheck<1, ImmCheck0_2_Mul2>], [ImmCheck<1, ImmCheck0>]>;
+  defm SVWRITE_ZA64 : WriteHV_VG<"za64[_{d}]", "lUld", "aarch64_sme_write", [ImmCheck<1, ImmCheck0>], [ImmCheck<1, ImmCheck0>]>;
+}
+
+multiclass ReadHV_VG<string n, string t, string i, list<ImmCheck> checkvg2,
+                     list<ImmCheck> checkvg4> {
+  def NAME # _VG2_H : Inst<"svread_hor_" # n # "_vg2", "2mi", t, i # "_hor_vg2", [IsSharedZA, IsStreaming, IsZASliceBaseOffsetIntr], checkvg2>;
+  def NAME # _VG2_V : Inst<"svread_ver_" # n # "_vg2", "2mi", t, i # "_ver_vg2", [IsSharedZA, IsStreaming, IsZASliceBaseOffsetIntr], checkvg2>;
+  def NAME # _VG4_H : Inst<"svread_hor_" # n # "_vg4", "4mi", t, i # "_hor_vg4", [IsSharedZA, IsStreaming, IsZASliceBaseOffsetIntr], checkvg4>;
+  def NAME # _VG4_V : Inst<"svread_ver_" # n # "_vg4", "4mi", t, i # "_ver_vg4", [IsSharedZA, IsStreaming, IsZASliceBaseOffsetIntr], checkvg4>;
+}
+
+let ArchGuard = "defined(__ARM_FEATURE_SME2)" in {
+  defm SVREAD_ZA8 : ReadHV_VG<"za8_{d}", "cUc", "aarch64_sme_read", [ImmCheck<1, ImmCheck0_14_Mul2>], [ImmCheck<1, ImmCheck0_12_Mul4>]>;
+  defm SVREAD_ZA16 : ReadHV_VG<"za16_{d}", "sUshb", "aarch64_sme_read", [ImmCheck<1, ImmCheck0_6_Mul2>], [ImmCheck<1, ImmCheck0_4_Mul4>]>;
+  defm SVREAD_ZA32 : ReadHV_VG<"za32_{d}", "iUif", "aarch64_sme_read", [ImmCheck<1, ImmCheck0_2_Mul2>], [ImmCheck<1, ImmCheck0>]>;
+  defm SVREAD_ZA64 : ReadHV_VG<"za64_{d}", "lUld", "aarch64_sme_read", [ImmCheck<1, ImmCheck0>], [ImmCheck<1, ImmCheck0>]>;
+}
+
+
+//
+// Single vector-group read/write intrinsics.
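+// For example, SVWRITE_ZA64_VG1x2 below has prototype "vmi2", which is
+// expected to surface in C roughly as (a sketch):
+//   void svwrite_za64_u64_vg1x2(uint32_t slice_base, uint64_t imm,
+//                               svuint64x2_t zn);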
+// + +let ArchGuard = "defined(__ARM_FEATURE_SME2)" in { + def SVWRITE_ZA64_VG1x2 : Inst<"svwrite_za64[_{d}]_vg1x2", "vmi2", "lUld", "aarch64_sme_write_vg1x2", [IsSharedZA, IsStreaming, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVWRITE_ZA64_VG1x4 : Inst<"svwrite_za64[_{d}]_vg1x4", "vmi4", "lUld", "aarch64_sme_write_vg1x4", [IsSharedZA, IsStreaming, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVREAD_ZA64_VG1x2 : Inst<"svread_za64_{d}_vg1x2", "2mi", "lUld", "aarch64_sme_read_vg1x2", [IsSharedZA, IsStreaming, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVREAD_ZA64_VG1x4 : Inst<"svread_za64_{d}_vg1x4", "4mi", "lUld", "aarch64_sme_read_vg1x4", [IsSharedZA, IsStreaming, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; +} diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td --- a/clang/include/clang/Basic/arm_sve.td +++ b/clang/include/clang/Basic/arm_sve.td @@ -13,225 +13,7 @@ // //===----------------------------------------------------------------------===// -//===----------------------------------------------------------------------===// -// Instruction definitions -//===----------------------------------------------------------------------===// -// Every intrinsic subclasses "Inst". An intrinsic has a name, a prototype and -// a sequence of typespecs. -// -// The name is the base name of the intrinsic, for example "svld1". This is -// then mangled by the tblgen backend to add type information ("svld1_s16"). -// -// A typespec is a sequence of uppercase characters (modifiers) followed by one -// lowercase character. A typespec encodes a particular "base type" of the -// intrinsic. -// -// An example typespec is "Us" - unsigned short - svuint16_t. The available -// typespec codes are given below. -// -// The string given to an Inst class is a sequence of typespecs. The intrinsic -// is instantiated for every typespec in the sequence. For example "sdUsUd". -// -// The prototype is a string that defines the return type of the intrinsic -// and the type of each argument. The return type and every argument gets a -// "modifier" that can change in some way the "base type" of the intrinsic. -// -// The modifier 'd' means "default" and does not modify the base type in any -// way. The available modifiers are given below. -// -// Typespecs -// --------- -// c: char -// s: short -// i: int -// l: long -// f: float -// h: half-float -// d: double -// b: bfloat - -// Typespec modifiers -// ------------------ -// P: boolean -// U: unsigned - -// Prototype modifiers -// ------------------- -// prototype: return (arg, arg, ...) -// -// 2,3,4: array of default vectors -// v: void -// x: vector of signed integers -// u: vector of unsigned integers -// d: default -// c: const pointer type -// P: predicate type -// s: scalar of element type -// a: scalar of element type (splat to vector type) -// R: scalar of 1/2 width element type (splat to vector type) -// r: scalar of 1/4 width element type (splat to vector type) -// @: unsigned scalar of 1/4 width element type (splat to vector type) -// e: 1/2 width unsigned elements, 2x element count -// b: 1/4 width unsigned elements, 4x element count -// h: 1/2 width elements, 2x element count -// q: 1/4 width elements, 4x element count -// o: 4x width elements, 1/4 element count -// -// w: vector of element type promoted to 64bits, vector maintains -// signedness of its element type. 
-// f: element type promoted to uint64_t (splat to vector type) -// j: element type promoted to 64bits (splat to vector type) -// K: element type bitcast to a signed integer (splat to vector type) -// L: element type bitcast to an unsigned integer (splat to vector type) -// -// i: constant uint64_t -// k: int32_t -// l: int64_t -// m: uint32_t -// n: uint64_t - -// t: svint32_t -// z: svuint32_t -// g: svuint64_t -// O: svfloat16_t -// M: svfloat32_t -// N: svfloat64_t - -// J: Prefetch type (sv_prfop) -// A: pointer to int8_t -// B: pointer to int16_t -// C: pointer to int32_t -// D: pointer to int64_t - -// E: pointer to uint8_t -// F: pointer to uint16_t -// G: pointer to uint32_t -// H: pointer to uint64_t - -// Q: const pointer to void - -// S: const pointer to int8_t -// T: const pointer to int16_t -// U: const pointer to int32_t -// V: const pointer to int64_t -// -// W: const pointer to uint8_t -// X: const pointer to uint16_t -// Y: const pointer to uint32_t -// Z: const pointer to uint64_t - -class MergeType { - int Value = val; - string Suffix = suffix; -} -def MergeNone : MergeType<0>; -def MergeAny : MergeType<1, "_x">; -def MergeOp1 : MergeType<2, "_m">; -def MergeZero : MergeType<3, "_z">; -def MergeAnyExp : MergeType<4, "_x">; // Use merged builtin with explicit -def MergeZeroExp : MergeType<5, "_z">; // generation of its inactive argument. - -class EltType { - int Value = val; -} -def EltTyInvalid : EltType<0>; -def EltTyInt8 : EltType<1>; -def EltTyInt16 : EltType<2>; -def EltTyInt32 : EltType<3>; -def EltTyInt64 : EltType<4>; -def EltTyFloat16 : EltType<5>; -def EltTyFloat32 : EltType<6>; -def EltTyFloat64 : EltType<7>; -def EltTyBool8 : EltType<8>; -def EltTyBool16 : EltType<9>; -def EltTyBool32 : EltType<10>; -def EltTyBool64 : EltType<11>; -def EltTyBFloat16 : EltType<12>; - -class MemEltType { - int Value = val; -} -def MemEltTyDefault : MemEltType<0>; -def MemEltTyInt8 : MemEltType<1>; -def MemEltTyInt16 : MemEltType<2>; -def MemEltTyInt32 : MemEltType<3>; -def MemEltTyInt64 : MemEltType<4>; - -class FlagType { - int Value = val; -} - -// These must be kept in sync with the flags in utils/TableGen/SveEmitter.h -// and include/clang/Basic/TargetBuiltins.h -def NoFlags : FlagType<0x00000000>; -def FirstEltType : FlagType<0x00000001>; -// : : -// : : -def EltTypeMask : FlagType<0x0000000f>; -def FirstMemEltType : FlagType<0x00000010>; -// : : -// : : -def MemEltTypeMask : FlagType<0x00000070>; -def FirstMergeTypeMask : FlagType<0x00000080>; -// : : -// : : -def MergeTypeMask : FlagType<0x00000380>; -def FirstSplatOperand : FlagType<0x00000400>; -// : : -// These flags are used to specify which scalar operand -// needs to be duplicated/splatted into a vector. -// : : -def SplatOperandMask : FlagType<0x00001C00>; -def IsLoad : FlagType<0x00002000>; -def IsStore : FlagType<0x00004000>; -def IsGatherLoad : FlagType<0x00008000>; -def IsScatterStore : FlagType<0x00010000>; -def IsStructLoad : FlagType<0x00020000>; -def IsStructStore : FlagType<0x00040000>; -def IsZExtReturn : FlagType<0x00080000>; // Return value is sign-extend by default -def IsOverloadNone : FlagType<0x00100000>; // Intrinsic does not take any overloaded types. -def IsOverloadWhile : FlagType<0x00200000>; // Use {default type, typeof(operand1)} as overloaded types. -def IsOverloadWhileRW : FlagType<0x00400000>; // Use {pred(default type), typeof(operand0)} as overloaded types. -def IsOverloadCvt : FlagType<0x00800000>; // Use {typeof(operand0), typeof(last operand)} as overloaded types. 
-def OverloadKindMask : FlagType<0x00E00000>; // When the masked values are all '0', the default type is used as overload type. -def IsByteIndexed : FlagType<0x01000000>; -def IsAppendSVALL : FlagType<0x02000000>; // Appends SV_ALL as the last operand. -def IsInsertOp1SVALL : FlagType<0x04000000>; // Inserts SV_ALL as the second operand. -def IsPrefetch : FlagType<0x08000000>; // Contiguous prefetches. -def IsGatherPrefetch : FlagType<0x10000000>; -def ReverseCompare : FlagType<0x20000000>; // Compare operands must be swapped. -def ReverseUSDOT : FlagType<0x40000000>; // Unsigned/signed operands must be swapped. -def IsUndef : FlagType<0x80000000>; // Codegen `undef` of given type. -def IsTupleCreate : FlagType<0x100000000>; -def IsTupleGet : FlagType<0x200000000>; -def IsTupleSet : FlagType<0x400000000>; - -// These must be kept in sync with the flags in include/clang/Basic/TargetBuiltins.h -class ImmCheckType { - int Value = val; -} -def ImmCheck0_31 : ImmCheckType<0>; // 0..31 (used for e.g. predicate patterns) -def ImmCheck1_16 : ImmCheckType<1>; // 1..16 -def ImmCheckExtract : ImmCheckType<2>; // 0..(2048/sizeinbits(elt) - 1) -def ImmCheckShiftRight : ImmCheckType<3>; // 1..sizeinbits(elt) -def ImmCheckShiftRightNarrow : ImmCheckType<4>; // 1..sizeinbits(elt)/2 -def ImmCheckShiftLeft : ImmCheckType<5>; // 0..(sizeinbits(elt) - 1) -def ImmCheck0_7 : ImmCheckType<6>; // 0..7 -def ImmCheckLaneIndex : ImmCheckType<7>; // 0..(128/(1*sizeinbits(elt)) - 1) -def ImmCheckLaneIndexCompRotate : ImmCheckType<8>; // 0..(128/(2*sizeinbits(elt)) - 1) -def ImmCheckLaneIndexDot : ImmCheckType<9>; // 0..(128/(4*sizeinbits(elt)) - 1) -def ImmCheckComplexRot90_270 : ImmCheckType<10>; // [90,270] -def ImmCheckComplexRotAll90 : ImmCheckType<11>; // [0, 90, 180,270] -def ImmCheck0_13 : ImmCheckType<12>; // 0..13 -def ImmCheck0_1 : ImmCheckType<13>; // 0..1 -def ImmCheck0_2 : ImmCheckType<14>; // 0..2 -def ImmCheck0_3 : ImmCheckType<15>; // 0..3 - -class ImmCheck { - int Arg = arg; - int EltSizeArg = eltSizeArg; - ImmCheckType Kind = kind; -} +include "arm_sve_common.td" class Inst ft, list ch, MemEltType met> { @@ -263,27 +45,27 @@ // Loads // Load one vector (scalar base) -def SVLD1 : MInst<"svld1[_{2}]", "dPc", "csilUcUsUiUlhfd", [IsLoad], MemEltTyDefault, "aarch64_sve_ld1">; -def SVLD1SB : MInst<"svld1sb_{d}", "dPS", "silUsUiUl", [IsLoad], MemEltTyInt8, "aarch64_sve_ld1">; -def SVLD1UB : MInst<"svld1ub_{d}", "dPW", "silUsUiUl", [IsLoad, IsZExtReturn], MemEltTyInt8, "aarch64_sve_ld1">; -def SVLD1SH : MInst<"svld1sh_{d}", "dPT", "ilUiUl", [IsLoad], MemEltTyInt16, "aarch64_sve_ld1">; -def SVLD1UH : MInst<"svld1uh_{d}", "dPX", "ilUiUl", [IsLoad, IsZExtReturn], MemEltTyInt16, "aarch64_sve_ld1">; -def SVLD1SW : MInst<"svld1sw_{d}", "dPU", "lUl", [IsLoad], MemEltTyInt32, "aarch64_sve_ld1">; -def SVLD1UW : MInst<"svld1uw_{d}", "dPY", "lUl", [IsLoad, IsZExtReturn], MemEltTyInt32, "aarch64_sve_ld1">; +def SVLD1 : MInst<"svld1[_{2}]", "dPc", "csilUcUsUiUlhfd", [IsLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_ld1">; +def SVLD1SB : MInst<"svld1sb_{d}", "dPS", "silUsUiUl", [IsLoad, IsStreamingCompatible], MemEltTyInt8, "aarch64_sve_ld1">; +def SVLD1UB : MInst<"svld1ub_{d}", "dPW", "silUsUiUl", [IsLoad, IsZExtReturn, IsStreamingCompatible], MemEltTyInt8, "aarch64_sve_ld1">; +def SVLD1SH : MInst<"svld1sh_{d}", "dPT", "ilUiUl", [IsLoad, IsStreamingCompatible], MemEltTyInt16, "aarch64_sve_ld1">; +def SVLD1UH : MInst<"svld1uh_{d}", "dPX", "ilUiUl", [IsLoad, IsZExtReturn, IsStreamingCompatible], MemEltTyInt16, 
"aarch64_sve_ld1">; +def SVLD1SW : MInst<"svld1sw_{d}", "dPU", "lUl", [IsLoad, IsStreamingCompatible], MemEltTyInt32, "aarch64_sve_ld1">; +def SVLD1UW : MInst<"svld1uw_{d}", "dPY", "lUl", [IsLoad, IsZExtReturn, IsStreamingCompatible], MemEltTyInt32, "aarch64_sve_ld1">; let ArchGuard = "defined(__ARM_FEATURE_SVE_BF16)" in { - def SVLD1_BF : MInst<"svld1[_{2}]", "dPc", "b", [IsLoad], MemEltTyDefault, "aarch64_sve_ld1">; - def SVLD1_VNUM_BF : MInst<"svld1_vnum[_{2}]", "dPcl", "b", [IsLoad], MemEltTyDefault, "aarch64_sve_ld1">; + def SVLD1_BF : MInst<"svld1[_{2}]", "dPc", "b", [IsLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_ld1">; + def SVLD1_VNUM_BF : MInst<"svld1_vnum[_{2}]", "dPcl", "b", [IsLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_ld1">; } // Load one vector (scalar base, VL displacement) -def SVLD1_VNUM : MInst<"svld1_vnum[_{2}]", "dPcl", "csilUcUsUiUlhfd", [IsLoad], MemEltTyDefault, "aarch64_sve_ld1">; -def SVLD1SB_VNUM : MInst<"svld1sb_vnum_{d}", "dPSl", "silUsUiUl", [IsLoad], MemEltTyInt8, "aarch64_sve_ld1">; -def SVLD1UB_VNUM : MInst<"svld1ub_vnum_{d}", "dPWl", "silUsUiUl", [IsLoad, IsZExtReturn], MemEltTyInt8, "aarch64_sve_ld1">; -def SVLD1SH_VNUM : MInst<"svld1sh_vnum_{d}", "dPTl", "ilUiUl", [IsLoad], MemEltTyInt16, "aarch64_sve_ld1">; -def SVLD1UH_VNUM : MInst<"svld1uh_vnum_{d}", "dPXl", "ilUiUl", [IsLoad, IsZExtReturn], MemEltTyInt16, "aarch64_sve_ld1">; -def SVLD1SW_VNUM : MInst<"svld1sw_vnum_{d}", "dPUl", "lUl", [IsLoad], MemEltTyInt32, "aarch64_sve_ld1">; -def SVLD1UW_VNUM : MInst<"svld1uw_vnum_{d}", "dPYl", "lUl", [IsLoad, IsZExtReturn], MemEltTyInt32, "aarch64_sve_ld1">; +def SVLD1_VNUM : MInst<"svld1_vnum[_{2}]", "dPcl", "csilUcUsUiUlhfd", [IsLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_ld1">; +def SVLD1SB_VNUM : MInst<"svld1sb_vnum_{d}", "dPSl", "silUsUiUl", [IsLoad, IsStreamingCompatible], MemEltTyInt8, "aarch64_sve_ld1">; +def SVLD1UB_VNUM : MInst<"svld1ub_vnum_{d}", "dPWl", "silUsUiUl", [IsLoad, IsZExtReturn, IsStreamingCompatible], MemEltTyInt8, "aarch64_sve_ld1">; +def SVLD1SH_VNUM : MInst<"svld1sh_vnum_{d}", "dPTl", "ilUiUl", [IsLoad, IsStreamingCompatible], MemEltTyInt16, "aarch64_sve_ld1">; +def SVLD1UH_VNUM : MInst<"svld1uh_vnum_{d}", "dPXl", "ilUiUl", [IsLoad, IsZExtReturn, IsStreamingCompatible], MemEltTyInt16, "aarch64_sve_ld1">; +def SVLD1SW_VNUM : MInst<"svld1sw_vnum_{d}", "dPUl", "lUl", [IsLoad, IsStreamingCompatible], MemEltTyInt32, "aarch64_sve_ld1">; +def SVLD1UW_VNUM : MInst<"svld1uw_vnum_{d}", "dPYl", "lUl", [IsLoad, IsZExtReturn, IsStreamingCompatible], MemEltTyInt32, "aarch64_sve_ld1">; // Load one vector (vector base) def SVLD1_GATHER_BASES_U : MInst<"svld1_gather[_{2}base]_{d}", "dPu", "ilUiUlfd", [IsGatherLoad], MemEltTyDefault, "aarch64_sve_ld1_gather_scalar_offset">; @@ -487,27 +269,27 @@ } // Load one vector, unextended load, non-temporal (scalar base) -def SVLDNT1 : MInst<"svldnt1[_{2}]", "dPc", "csilUcUsUiUlhfd", [IsLoad], MemEltTyDefault, "aarch64_sve_ldnt1">; +def SVLDNT1 : MInst<"svldnt1[_{2}]", "dPc", "csilUcUsUiUlhfd", [IsLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_ldnt1">; // Load one vector, unextended load, non-temporal (scalar base, VL displacement) -def SVLDNT1_VNUM : MInst<"svldnt1_vnum[_{2}]", "dPcl", "csilUcUsUiUlhfd", [IsLoad], MemEltTyDefault, "aarch64_sve_ldnt1">; +def SVLDNT1_VNUM : MInst<"svldnt1_vnum[_{2}]", "dPcl", "csilUcUsUiUlhfd", [IsLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_ldnt1">; let ArchGuard = "defined(__ARM_FEATURE_SVE_BF16)" in { - 
def SVLDNT1_BF : MInst<"svldnt1[_{2}]", "dPc", "b", [IsLoad], MemEltTyDefault, "aarch64_sve_ldnt1">;
-  def SVLDNT1_VNUM_BF : MInst<"svldnt1_vnum[_{2}]", "dPcl", "b", [IsLoad], MemEltTyDefault, "aarch64_sve_ldnt1">;
+  def SVLDNT1_BF : MInst<"svldnt1[_{2}]", "dPc", "b", [IsLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_ldnt1">;
+  def SVLDNT1_VNUM_BF : MInst<"svldnt1_vnum[_{2}]", "dPcl", "b", [IsLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_ldnt1">;
}

// Load one quadword and replicate (scalar base)
-def SVLD1RQ : SInst<"svld1rq[_{2}]", "dPc", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_ld1rq">;
+def SVLD1RQ : SInst<"svld1rq[_{2}]", "dPc", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_ld1rq", [IsStreamingCompatible]>;

let ArchGuard = "defined(__ARM_FEATURE_SVE_BF16)" in {
-  def SVLD1RQ_BF : SInst<"svld1rq[_{2}]", "dPc", "b", MergeNone, "aarch64_sve_ld1rq">;
+  def SVLD1RQ_BF : SInst<"svld1rq[_{2}]", "dPc", "b", MergeNone, "aarch64_sve_ld1rq", [IsStreamingCompatible]>;
}

multiclass StructLoad<string name, string proto, string i> {
-  def : SInst<name, proto, "csilUcUsUiUlhfd", MergeNone, i, [IsStructLoad]>;
+  def : SInst<name, proto, "csilUcUsUiUlhfd", MergeNone, i, [IsStructLoad, IsStreamingCompatible]>;
  let ArchGuard = "defined(__ARM_FEATURE_SVE_BF16)" in {
-    def: SInst<name, proto, "b", MergeNone, i, [IsStructLoad]>;
+    def: SInst<name, proto, "b", MergeNone, i, [IsStructLoad, IsStreamingCompatible]>;
  }
}

@@ -530,42 +312,42 @@
}

let ArchGuard = "defined(__ARM_FEATURE_SVE_BF16)" in {
-  def SVBFDOT : SInst<"svbfdot[_{0}]", "MMdd", "b", MergeNone, "aarch64_sve_bfdot", [IsOverloadNone]>;
-  def SVBFMLALB : SInst<"svbfmlalb[_{0}]", "MMdd", "b", MergeNone, "aarch64_sve_bfmlalb", [IsOverloadNone]>;
-  def SVBFMLALT : SInst<"svbfmlalt[_{0}]", "MMdd", "b", MergeNone, "aarch64_sve_bfmlalt", [IsOverloadNone]>;
+  def SVBFDOT : SInst<"svbfdot[_{0}]", "MMdd", "b", MergeNone, "aarch64_sve_bfdot", [IsOverloadNone, IsStreamingCompatible]>;
+  def SVBFMLALB : SInst<"svbfmlalb[_{0}]", "MMdd", "b", MergeNone, "aarch64_sve_bfmlalb", [IsOverloadNone, IsStreamingCompatible]>;
+  def SVBFMLALT : SInst<"svbfmlalt[_{0}]", "MMdd", "b", MergeNone, "aarch64_sve_bfmlalt", [IsOverloadNone, IsStreamingCompatible]>;
  def SVBFMMLA : SInst<"svbfmmla[_{0}]", "MMdd", "b", MergeNone, "aarch64_sve_bfmmla", [IsOverloadNone]>;
-  def SVBFDOT_N : SInst<"svbfdot[_n_{0}]", "MMda", "b", MergeNone, "aarch64_sve_bfdot", [IsOverloadNone]>;
-  def SVBFMLAL_N : SInst<"svbfmlalb[_n_{0}]", "MMda", "b", MergeNone, "aarch64_sve_bfmlalb", [IsOverloadNone]>;
-  def SVBFMLALT_N : SInst<"svbfmlalt[_n_{0}]", "MMda", "b", MergeNone, "aarch64_sve_bfmlalt", [IsOverloadNone]>;
-  def SVBFDOT_LANE : SInst<"svbfdot_lane[_{0}]", "MMddn", "b", MergeNone, "aarch64_sve_bfdot_lane", [IsOverloadNone], [ImmCheck<3, ImmCheck0_3>]>;
-  def SVBFMLALB_LANE : SInst<"svbfmlalb_lane[_{0}]", "MMddn", "b", MergeNone, "aarch64_sve_bfmlalb_lane", [IsOverloadNone], [ImmCheck<3, ImmCheck0_7>]>;
-  def SVBFMLALT_LANE : SInst<"svbfmlalt_lane[_{0}]", "MMddn", "b", MergeNone, "aarch64_sve_bfmlalt_lane", [IsOverloadNone], [ImmCheck<3, ImmCheck0_7>]>;
+  def SVBFDOT_N : SInst<"svbfdot[_n_{0}]", "MMda", "b", MergeNone, "aarch64_sve_bfdot", [IsOverloadNone, IsStreamingCompatible]>;
+  def SVBFMLAL_N : SInst<"svbfmlalb[_n_{0}]", "MMda", "b", MergeNone, "aarch64_sve_bfmlalb", [IsOverloadNone, IsStreamingCompatible]>;
+  def SVBFMLALT_N : SInst<"svbfmlalt[_n_{0}]", "MMda", "b", MergeNone, "aarch64_sve_bfmlalt", [IsOverloadNone, IsStreamingCompatible]>;
+  def SVBFDOT_LANE : SInst<"svbfdot_lane[_{0}]", "MMddn", "b", MergeNone, "aarch64_sve_bfdot_lane", [IsOverloadNone, IsStreamingCompatible], [ImmCheck<3, ImmCheck0_3>]>;
+  def SVBFMLALB_LANE : SInst<"svbfmlalb_lane[_{0}]", "MMddn", "b", MergeNone, "aarch64_sve_bfmlalb_lane", [IsOverloadNone, IsStreamingCompatible], [ImmCheck<3, 
ImmCheck0_7>]>; + def SVBFMLALT_LANE : SInst<"svbfmlalt_lane[_{0}]", "MMddn", "b", MergeNone, "aarch64_sve_bfmlalt_lane", [IsOverloadNone, IsStreamingCompatible], [ImmCheck<3, ImmCheck0_7>]>; } //////////////////////////////////////////////////////////////////////////////// // Stores // Store one vector (scalar base) -def SVST1 : MInst<"svst1[_{d}]", "vPpd", "csilUcUsUiUlhfd", [IsStore], MemEltTyDefault, "aarch64_sve_st1">; -def SVST1B_S : MInst<"svst1b[_{d}]", "vPAd", "sil", [IsStore], MemEltTyInt8, "aarch64_sve_st1">; -def SVST1B_U : MInst<"svst1b[_{d}]", "vPEd", "UsUiUl", [IsStore], MemEltTyInt8, "aarch64_sve_st1">; -def SVST1H_S : MInst<"svst1h[_{d}]", "vPBd", "il", [IsStore], MemEltTyInt16, "aarch64_sve_st1">; -def SVST1H_U : MInst<"svst1h[_{d}]", "vPFd", "UiUl", [IsStore], MemEltTyInt16, "aarch64_sve_st1">; -def SVST1W_S : MInst<"svst1w[_{d}]", "vPCd", "l", [IsStore], MemEltTyInt32, "aarch64_sve_st1">; -def SVST1W_U : MInst<"svst1w[_{d}]", "vPGd", "Ul", [IsStore], MemEltTyInt32, "aarch64_sve_st1">; +def SVST1 : MInst<"svst1[_{d}]", "vPpd", "csilUcUsUiUlhfd", [IsStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_st1">; +def SVST1B_S : MInst<"svst1b[_{d}]", "vPAd", "sil", [IsStore, IsStreamingCompatible], MemEltTyInt8, "aarch64_sve_st1">; +def SVST1B_U : MInst<"svst1b[_{d}]", "vPEd", "UsUiUl", [IsStore, IsStreamingCompatible], MemEltTyInt8, "aarch64_sve_st1">; +def SVST1H_S : MInst<"svst1h[_{d}]", "vPBd", "il", [IsStore, IsStreamingCompatible], MemEltTyInt16, "aarch64_sve_st1">; +def SVST1H_U : MInst<"svst1h[_{d}]", "vPFd", "UiUl", [IsStore, IsStreamingCompatible], MemEltTyInt16, "aarch64_sve_st1">; +def SVST1W_S : MInst<"svst1w[_{d}]", "vPCd", "l", [IsStore, IsStreamingCompatible], MemEltTyInt32, "aarch64_sve_st1">; +def SVST1W_U : MInst<"svst1w[_{d}]", "vPGd", "Ul", [IsStore, IsStreamingCompatible], MemEltTyInt32, "aarch64_sve_st1">; // Store one vector (scalar base, VL displacement) -def SVST1_VNUM : MInst<"svst1_vnum[_{d}]", "vPpld", "csilUcUsUiUlhfd", [IsStore], MemEltTyDefault, "aarch64_sve_st1">; -def SVST1B_VNUM_S : MInst<"svst1b_vnum[_{d}]", "vPAld", "sil", [IsStore], MemEltTyInt8, "aarch64_sve_st1">; -def SVST1B_VNUM_U : MInst<"svst1b_vnum[_{d}]", "vPEld", "UsUiUl", [IsStore], MemEltTyInt8, "aarch64_sve_st1">; -def SVST1H_VNUM_S : MInst<"svst1h_vnum[_{d}]", "vPBld", "il", [IsStore], MemEltTyInt16, "aarch64_sve_st1">; -def SVST1H_VNUM_U : MInst<"svst1h_vnum[_{d}]", "vPFld", "UiUl", [IsStore], MemEltTyInt16, "aarch64_sve_st1">; -def SVST1W_VNUM_S : MInst<"svst1w_vnum[_{d}]", "vPCld", "l", [IsStore], MemEltTyInt32, "aarch64_sve_st1">; -def SVST1W_VNUM_U : MInst<"svst1w_vnum[_{d}]", "vPGld", "Ul", [IsStore], MemEltTyInt32, "aarch64_sve_st1">; +def SVST1_VNUM : MInst<"svst1_vnum[_{d}]", "vPpld", "csilUcUsUiUlhfd", [IsStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_st1">; +def SVST1B_VNUM_S : MInst<"svst1b_vnum[_{d}]", "vPAld", "sil", [IsStore, IsStreamingCompatible], MemEltTyInt8, "aarch64_sve_st1">; +def SVST1B_VNUM_U : MInst<"svst1b_vnum[_{d}]", "vPEld", "UsUiUl", [IsStore, IsStreamingCompatible], MemEltTyInt8, "aarch64_sve_st1">; +def SVST1H_VNUM_S : MInst<"svst1h_vnum[_{d}]", "vPBld", "il", [IsStore, IsStreamingCompatible], MemEltTyInt16, "aarch64_sve_st1">; +def SVST1H_VNUM_U : MInst<"svst1h_vnum[_{d}]", "vPFld", "UiUl", [IsStore, IsStreamingCompatible], MemEltTyInt16, "aarch64_sve_st1">; +def SVST1W_VNUM_S : MInst<"svst1w_vnum[_{d}]", "vPCld", "l", [IsStore, IsStreamingCompatible], MemEltTyInt32, "aarch64_sve_st1">; +def SVST1W_VNUM_U : 
MInst<"svst1w_vnum[_{d}]", "vPGld", "Ul", [IsStore, IsStreamingCompatible], MemEltTyInt32, "aarch64_sve_st1">; let ArchGuard = "defined(__ARM_FEATURE_SVE_BF16)" in { - def SVST1_BF : MInst<"svst1[_{d}]", "vPpd", "b", [IsStore], MemEltTyDefault, "aarch64_sve_st1">; - def SVST1_VNUM_BF : MInst<"svst1_vnum[_{d}]", "vPpld", "b", [IsStore], MemEltTyDefault, "aarch64_sve_st1">; + def SVST1_BF : MInst<"svst1[_{d}]", "vPpd", "b", [IsStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_st1">; + def SVST1_VNUM_BF : MInst<"svst1_vnum[_{d}]", "vPpld", "b", [IsStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_st1">; } // Store one vector (vector base) @@ -638,9 +420,9 @@ def SVST1W_SCATTER_INDEX_S : MInst<"svst1w_scatter[_{2}base]_index[_{d}]", "vPuld", "lUl", [IsScatterStore], MemEltTyInt32, "aarch64_sve_st1_scatter_scalar_offset">; multiclass StructStore { - def : SInst; + def : SInst; let ArchGuard = "defined(__ARM_FEATURE_SVE_BF16)" in { - def: SInst; + def: SInst; } } // Store N vectors into N-element structure (scalar base) @@ -654,30 +436,30 @@ defm SVST4_VNUM : StructStore<"svst4_vnum[_{d}]", "vPpl4", "aarch64_sve_st4">; // Store one vector, with no truncation, non-temporal (scalar base) -def SVSTNT1 : MInst<"svstnt1[_{d}]", "vPpd", "csilUcUsUiUlhfd", [IsStore], MemEltTyDefault, "aarch64_sve_stnt1">; +def SVSTNT1 : MInst<"svstnt1[_{d}]", "vPpd", "csilUcUsUiUlhfd", [IsStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_stnt1">; // Store one vector, with no truncation, non-temporal (scalar base, VL displacement) -def SVSTNT1_VNUM : MInst<"svstnt1_vnum[_{d}]", "vPpld", "csilUcUsUiUlhfd", [IsStore], MemEltTyDefault, "aarch64_sve_stnt1">; +def SVSTNT1_VNUM : MInst<"svstnt1_vnum[_{d}]", "vPpld", "csilUcUsUiUlhfd", [IsStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_stnt1">; let ArchGuard = "defined(__ARM_FEATURE_SVE_BF16)" in { - def SVSTNT1_BF : MInst<"svstnt1[_{d}]", "vPpd", "b", [IsStore], MemEltTyDefault, "aarch64_sve_stnt1">; - def SVSTNT1_VNUM_BF : MInst<"svstnt1_vnum[_{d}]", "vPpld", "b", [IsStore], MemEltTyDefault, "aarch64_sve_stnt1">; + def SVSTNT1_BF : MInst<"svstnt1[_{d}]", "vPpd", "b", [IsStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_stnt1">; + def SVSTNT1_VNUM_BF : MInst<"svstnt1_vnum[_{d}]", "vPpld", "b", [IsStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_stnt1">; } //////////////////////////////////////////////////////////////////////////////// // Prefetches // Prefetch (Scalar base) -def SVPRFB : MInst<"svprfb", "vPQJ", "c", [IsPrefetch], MemEltTyInt8, "aarch64_sve_prf">; -def SVPRFH : MInst<"svprfh", "vPQJ", "s", [IsPrefetch], MemEltTyInt16, "aarch64_sve_prf">; -def SVPRFW : MInst<"svprfw", "vPQJ", "i", [IsPrefetch], MemEltTyInt32, "aarch64_sve_prf">; -def SVPRFD : MInst<"svprfd", "vPQJ", "l", [IsPrefetch], MemEltTyInt64, "aarch64_sve_prf">; +def SVPRFB : MInst<"svprfb", "vPQJ", "c", [IsPrefetch, IsStreamingCompatible], MemEltTyInt8, "aarch64_sve_prf">; +def SVPRFH : MInst<"svprfh", "vPQJ", "s", [IsPrefetch, IsStreamingCompatible], MemEltTyInt16, "aarch64_sve_prf">; +def SVPRFW : MInst<"svprfw", "vPQJ", "i", [IsPrefetch, IsStreamingCompatible], MemEltTyInt32, "aarch64_sve_prf">; +def SVPRFD : MInst<"svprfd", "vPQJ", "l", [IsPrefetch, IsStreamingCompatible], MemEltTyInt64, "aarch64_sve_prf">; // Prefetch (Scalar base, VL displacement) -def SVPRFB_VNUM : MInst<"svprfb_vnum", "vPQlJ", "c", [IsPrefetch], MemEltTyInt8, "aarch64_sve_prf">; -def SVPRFH_VNUM : MInst<"svprfh_vnum", "vPQlJ", "s", [IsPrefetch], MemEltTyInt16, 
"aarch64_sve_prf">; -def SVPRFW_VNUM : MInst<"svprfw_vnum", "vPQlJ", "i", [IsPrefetch], MemEltTyInt32, "aarch64_sve_prf">; -def SVPRFD_VNUM : MInst<"svprfd_vnum", "vPQlJ", "l", [IsPrefetch], MemEltTyInt64, "aarch64_sve_prf">; +def SVPRFB_VNUM : MInst<"svprfb_vnum", "vPQlJ", "c", [IsPrefetch, IsStreamingCompatible], MemEltTyInt8, "aarch64_sve_prf">; +def SVPRFH_VNUM : MInst<"svprfh_vnum", "vPQlJ", "s", [IsPrefetch, IsStreamingCompatible], MemEltTyInt16, "aarch64_sve_prf">; +def SVPRFW_VNUM : MInst<"svprfw_vnum", "vPQlJ", "i", [IsPrefetch, IsStreamingCompatible], MemEltTyInt32, "aarch64_sve_prf">; +def SVPRFD_VNUM : MInst<"svprfd_vnum", "vPQlJ", "l", [IsPrefetch, IsStreamingCompatible], MemEltTyInt64, "aarch64_sve_prf">; // Prefetch (Vector bases) def SVPRFB_GATHER_BASES : MInst<"svprfb_gather[_{2}base]", "vPdJ", "UiUl", [IsGatherPrefetch], MemEltTyInt8, "aarch64_sve_prfb_gather_scalar_offset">; @@ -732,9 +514,9 @@ def SVDUPQ_64 : SInst<"svdupq[_n]_{d}", "dss", "lUld", MergeNone>; multiclass svdup_base { - def NAME : SInst; + def NAME : SInst; let ArchGuard = "defined(__ARM_FEATURE_SVE_BF16)" in { - def _BF16: SInst; + def _BF16: SInst; } } @@ -743,14 +525,14 @@ defm SVDUP_X : svdup_base<"svdup[_n]_{d}", "dPs", MergeAnyExp, "aarch64_sve_dup">; defm SVDUP_Z : svdup_base<"svdup[_n]_{d}", "dPs", MergeZeroExp, "aarch64_sve_dup">; -def SVINDEX : SInst<"svindex_{d}", "dss", "csilUcUsUiUl", MergeNone, "aarch64_sve_index">; +def SVINDEX : SInst<"svindex_{d}", "dss", "csilUcUsUiUl", MergeNone, "aarch64_sve_index", [IsStreamingCompatible]>; // Integer arithmetic -multiclass SInstZPZ flags=[]> { - def _M : SInst; - def _X : SInst; - def _Z : SInst; +multiclass SInstZPZ { + def _M : SInst; + def _X : SInst; + def _Z : SInst; } defm SVABS : SInstZPZ<"svabs", "csil", "aarch64_sve_abs">; @@ -758,14 +540,14 @@ //------------------------------------------------------------------------------ -multiclass SInstZPZZ flags=[]> { - def _M : SInst; - def _X : SInst; - def _Z : SInst; +multiclass SInstZPZZ { + def _M : SInst; + def _X : SInst; + def _Z : SInst; - def _N_M : SInst; - def _N_X : SInst; - def _N_Z : SInst; + def _N_M : SInst; + def _N_X : SInst; + def _N_Z : SInst; } defm SVABD_S : SInstZPZZ<"svabd", "csil", "aarch64_sve_sabd">; @@ -787,14 +569,14 @@ //------------------------------------------------------------------------------ -multiclass SInstZPZZZ flags=[]> { - def _M : SInst; - def _X : SInst; - def _Z : SInst; +multiclass SInstZPZZZ { + def _M : SInst; + def _X : SInst; + def _Z : SInst; - def _N_M : SInst; - def _N_X : SInst; - def _N_Z : SInst; + def _N_M : SInst; + def _N_X : SInst; + def _N_Z : SInst; } defm SVMAD : SInstZPZZZ<"svmad", "csilUcUsUiUl", "aarch64_sve_mad">; @@ -804,19 +586,19 @@ //------------------------------------------------------------------------------ -def SVDOT_S : SInst<"svdot[_{0}]", "ddqq", "il", MergeNone, "aarch64_sve_sdot">; -def SVDOT_U : SInst<"svdot[_{0}]", "ddqq", "UiUl", MergeNone, "aarch64_sve_udot">; -def SVQADD_S : SInst<"svqadd[_{d}]", "ddd", "csil", MergeNone, "aarch64_sve_sqadd_x">; -def SVQADD_U : SInst<"svqadd[_{d}]", "ddd", "UcUsUiUl", MergeNone, "aarch64_sve_uqadd_x">; -def SVQSUB_S : SInst<"svqsub[_{d}]", "ddd", "csil", MergeNone, "aarch64_sve_sqsub_x">; -def SVQSUB_U : SInst<"svqsub[_{d}]", "ddd", "UcUsUiUl", MergeNone, "aarch64_sve_uqsub_x">; +def SVDOT_S : SInst<"svdot[_{0}]", "ddqq", "il", MergeNone, "aarch64_sve_sdot", [IsStreamingCompatible]>; +def SVDOT_U : SInst<"svdot[_{0}]", "ddqq", "UiUl", MergeNone, "aarch64_sve_udot", 
[IsStreamingCompatible]>; +def SVQADD_S : SInst<"svqadd[_{d}]", "ddd", "csil", MergeNone, "aarch64_sve_sqadd_x", [IsStreamingCompatible]>; +def SVQADD_U : SInst<"svqadd[_{d}]", "ddd", "UcUsUiUl", MergeNone, "aarch64_sve_uqadd_x", [IsStreamingCompatible]>; +def SVQSUB_S : SInst<"svqsub[_{d}]", "ddd", "csil", MergeNone, "aarch64_sve_sqsub_x", [IsStreamingCompatible]>; +def SVQSUB_U : SInst<"svqsub[_{d}]", "ddd", "UcUsUiUl", MergeNone, "aarch64_sve_uqsub_x", [IsStreamingCompatible]>; -def SVDOT_N_S : SInst<"svdot[_n_{0}]", "ddqr", "il", MergeNone, "aarch64_sve_sdot">; -def SVDOT_N_U : SInst<"svdot[_n_{0}]", "ddqr", "UiUl", MergeNone, "aarch64_sve_udot">; -def SVQADD_N_S : SInst<"svqadd[_n_{d}]", "dda", "csil", MergeNone, "aarch64_sve_sqadd_x">; -def SVQADD_N_U : SInst<"svqadd[_n_{d}]", "dda", "UcUsUiUl", MergeNone, "aarch64_sve_uqadd_x">; -def SVQSUB_N_S : SInst<"svqsub[_n_{d}]", "dda", "csil", MergeNone, "aarch64_sve_sqsub_x">; -def SVQSUB_N_U : SInst<"svqsub[_n_{d}]", "dda", "UcUsUiUl", MergeNone, "aarch64_sve_uqsub_x">; +def SVDOT_N_S : SInst<"svdot[_n_{0}]", "ddqr", "il", MergeNone, "aarch64_sve_sdot", [IsStreamingCompatible]>; +def SVDOT_N_U : SInst<"svdot[_n_{0}]", "ddqr", "UiUl", MergeNone, "aarch64_sve_udot", [IsStreamingCompatible]>; +def SVQADD_N_S : SInst<"svqadd[_n_{d}]", "dda", "csil", MergeNone, "aarch64_sve_sqadd_x", [IsStreamingCompatible]>; +def SVQADD_N_U : SInst<"svqadd[_n_{d}]", "dda", "UcUsUiUl", MergeNone, "aarch64_sve_uqadd_x", [IsStreamingCompatible]>; +def SVQSUB_N_S : SInst<"svqsub[_n_{d}]", "dda", "csil", MergeNone, "aarch64_sve_sqsub_x", [IsStreamingCompatible]>; +def SVQSUB_N_U : SInst<"svqsub[_n_{d}]", "dda", "UcUsUiUl", MergeNone, "aarch64_sve_uqsub_x", [IsStreamingCompatible]>; def SVDOT_LANE_S : SInst<"svdot_lane[_{d}]", "ddqqi", "il", MergeNone, "aarch64_sve_sdot_lane", [], [ImmCheck<3, ImmCheckLaneIndexDot, 2>]>; def SVDOT_LANE_U : SInst<"svdot_lane[_{d}]", "ddqqi", "UiUl", MergeNone, "aarch64_sve_udot_lane", [], [ImmCheck<3, ImmCheckLaneIndexDot, 2>]>; @@ -836,107 +618,107 @@ // Shifts multiclass SInst_SHIFT { - def _M : SInst; - def _X : SInst; - def _Z : SInst; + def _M : SInst; + def _X : SInst; + def _Z : SInst; - def _N_M : SInst; - def _N_X : SInst; - def _N_Z : SInst; + def _N_M : SInst; + def _N_X : SInst; + def _N_Z : SInst; - def _WIDE_M : SInst; - def _WIDE_X : SInst; - def _WIDE_Z : SInst; + def _WIDE_M : SInst; + def _WIDE_X : SInst; + def _WIDE_Z : SInst; - def _WIDE_N_M : SInst; - def _WIDE_N_X : SInst; - def _WIDE_N_Z : SInst; + def _WIDE_N_M : SInst; + def _WIDE_N_X : SInst; + def _WIDE_N_Z : SInst; } defm SVASR : SInst_SHIFT<"svasr", "aarch64_sve_asr", "csil", "csi">; defm SVLSL : SInst_SHIFT<"svlsl", "aarch64_sve_lsl", "csilUcUsUiUl", "csiUcUsUi">; defm SVLSR : SInst_SHIFT<"svlsr", "aarch64_sve_lsr", "UcUsUiUl", "UcUsUi">; -def SVASRD_M : SInst<"svasrd[_n_{d}]", "dPdi", "csil", MergeOp1, "aarch64_sve_asrd", [], [ImmCheck<2, ImmCheckShiftRight, 1>]>; -def SVASRD_X : SInst<"svasrd[_n_{d}]", "dPdi", "csil", MergeAny, "aarch64_sve_asrd", [], [ImmCheck<2, ImmCheckShiftRight, 1>]>; -def SVASRD_Z : SInst<"svasrd[_n_{d}]", "dPdi", "csil", MergeZero, "aarch64_sve_asrd", [], [ImmCheck<2, ImmCheckShiftRight, 1>]>; +def SVASRD_M : SInst<"svasrd[_n_{d}]", "dPdi", "csil", MergeOp1, "aarch64_sve_asrd", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRight, 1>]>; +def SVASRD_X : SInst<"svasrd[_n_{d}]", "dPdi", "csil", MergeAny, "aarch64_sve_asrd", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRight, 1>]>; +def SVASRD_Z : SInst<"svasrd[_n_{d}]", 
"dPdi", "csil", MergeZero, "aarch64_sve_asrd", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRight, 1>]>; -def SVINSR : SInst<"svinsr[_n_{d}]", "dds", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_insr">; +def SVINSR : SInst<"svinsr[_n_{d}]", "dds", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_insr", [IsStreamingCompatible]>; let ArchGuard = "defined(__ARM_FEATURE_SVE_BF16)" in { - def SVINSR_BF16 : SInst<"svinsr[_n_{d}]", "dds", "b", MergeNone, "aarch64_sve_insr">; + def SVINSR_BF16 : SInst<"svinsr[_n_{d}]", "dds", "b", MergeNone, "aarch64_sve_insr", [IsStreamingCompatible]>; } //////////////////////////////////////////////////////////////////////////////// // Integer reductions -def SVADDV_S : SInst<"svaddv[_{d}]", "lPd", "csil", MergeNone, "aarch64_sve_saddv">; -def SVADDV_U : SInst<"svaddv[_{d}]", "nPd", "UcUsUiUl", MergeNone, "aarch64_sve_uaddv">; -def SVANDV : SInst<"svandv[_{d}]", "sPd", "csilUcUsUiUl", MergeNone, "aarch64_sve_andv">; -def SVEORV : SInst<"sveorv[_{d}]", "sPd", "csilUcUsUiUl", MergeNone, "aarch64_sve_eorv">; -def SVMAXV_S : SInst<"svmaxv[_{d}]", "sPd", "csil", MergeNone, "aarch64_sve_smaxv">; -def SVMAXV_U : SInst<"svmaxv[_{d}]", "sPd", "UcUsUiUl", MergeNone, "aarch64_sve_umaxv">; -def SVMINV_S : SInst<"svminv[_{d}]", "sPd", "csil", MergeNone, "aarch64_sve_sminv">; -def SVMINV_U : SInst<"svminv[_{d}]", "sPd", "UcUsUiUl", MergeNone, "aarch64_sve_uminv">; -def SVORV : SInst<"svorv[_{d}]", "sPd", "csilUcUsUiUl", MergeNone, "aarch64_sve_orv">; +def SVADDV_S : SInst<"svaddv[_{d}]", "lPd", "csil", MergeNone, "aarch64_sve_saddv", [IsStreamingCompatible]>; +def SVADDV_U : SInst<"svaddv[_{d}]", "nPd", "UcUsUiUl", MergeNone, "aarch64_sve_uaddv", [IsStreamingCompatible]>; +def SVANDV : SInst<"svandv[_{d}]", "sPd", "csilUcUsUiUl", MergeNone, "aarch64_sve_andv", [IsStreamingCompatible]>; +def SVEORV : SInst<"sveorv[_{d}]", "sPd", "csilUcUsUiUl", MergeNone, "aarch64_sve_eorv", [IsStreamingCompatible]>; +def SVMAXV_S : SInst<"svmaxv[_{d}]", "sPd", "csil", MergeNone, "aarch64_sve_smaxv", [IsStreamingCompatible]>; +def SVMAXV_U : SInst<"svmaxv[_{d}]", "sPd", "UcUsUiUl", MergeNone, "aarch64_sve_umaxv", [IsStreamingCompatible]>; +def SVMINV_S : SInst<"svminv[_{d}]", "sPd", "csil", MergeNone, "aarch64_sve_sminv", [IsStreamingCompatible]>; +def SVMINV_U : SInst<"svminv[_{d}]", "sPd", "UcUsUiUl", MergeNone, "aarch64_sve_uminv", [IsStreamingCompatible]>; +def SVORV : SInst<"svorv[_{d}]", "sPd", "csilUcUsUiUl", MergeNone, "aarch64_sve_orv", [IsStreamingCompatible]>; //////////////////////////////////////////////////////////////////////////////// // Integer comparisons -def SVCMPEQ : SInst<"svcmpeq[_{d}]", "PPdd", "csilUcUsUiUl", MergeNone, "aarch64_sve_cmpeq">; -def SVCMPNE : SInst<"svcmpne[_{d}]", "PPdd", "csilUcUsUiUl", MergeNone, "aarch64_sve_cmpne">; -def SVCMPGE : SInst<"svcmpge[_{d}]", "PPdd", "csil", MergeNone, "aarch64_sve_cmpge">; -def SVCMPGT : SInst<"svcmpgt[_{d}]", "PPdd", "csil", MergeNone, "aarch64_sve_cmpgt">; -def SVCMPLE : SInst<"svcmple[_{d}]", "PPdd", "csil", MergeNone, "aarch64_sve_cmpge", [ReverseCompare]>; -def SVCMPLT : SInst<"svcmplt[_{d}]", "PPdd", "csil", MergeNone, "aarch64_sve_cmpgt", [ReverseCompare]>; -def SVCMPHI : SInst<"svcmpgt[_{d}]", "PPdd", "UcUsUiUl", MergeNone, "aarch64_sve_cmphi">; -def SVCMPHS : SInst<"svcmpge[_{d}]", "PPdd", "UcUsUiUl", MergeNone, "aarch64_sve_cmphs">; -def SVCMPLO : SInst<"svcmplt[_{d}]", "PPdd", "UcUsUiUl", MergeNone, "aarch64_sve_cmphi", [ReverseCompare]>; -def SVCMPLS : SInst<"svcmple[_{d}]", "PPdd", "UcUsUiUl", MergeNone, 
"aarch64_sve_cmphs", [ReverseCompare]>; - -def SVCMPEQ_N : SInst<"svcmpeq[_n_{d}]", "PPda", "csilUcUsUiUl", MergeNone, "aarch64_sve_cmpeq">; -def SVCMPNE_N : SInst<"svcmpne[_n_{d}]", "PPda", "csilUcUsUiUl", MergeNone, "aarch64_sve_cmpne">; -def SVCMPGE_N : SInst<"svcmpge[_n_{d}]", "PPda", "csil", MergeNone, "aarch64_sve_cmpge">; -def SVCMPGT_N : SInst<"svcmpgt[_n_{d}]", "PPda", "csil", MergeNone, "aarch64_sve_cmpgt">; -def SVCMPLE_N : SInst<"svcmple[_n_{d}]", "PPda", "csil", MergeNone, "aarch64_sve_cmpge", [ReverseCompare]>; -def SVCMPLT_N : SInst<"svcmplt[_n_{d}]", "PPda", "csil", MergeNone, "aarch64_sve_cmpgt", [ReverseCompare]>; -def SVCMPHS_N : SInst<"svcmpge[_n_{d}]", "PPda", "UcUsUiUl", MergeNone, "aarch64_sve_cmphs">; -def SVCMPHI_N : SInst<"svcmpgt[_n_{d}]", "PPda", "UcUsUiUl", MergeNone, "aarch64_sve_cmphi">; -def SVCMPLS_N : SInst<"svcmple[_n_{d}]", "PPda", "UcUsUiUl", MergeNone, "aarch64_sve_cmphs", [ReverseCompare]>; -def SVCMPLO_N : SInst<"svcmplt[_n_{d}]", "PPda", "UcUsUiUl", MergeNone, "aarch64_sve_cmphi", [ReverseCompare]>; - -def SVCMPEQ_WIDE : SInst<"svcmpeq_wide[_{d}]", "PPdw", "csi", MergeNone, "aarch64_sve_cmpeq_wide">; -def SVCMPNE_WIDE : SInst<"svcmpne_wide[_{d}]", "PPdw", "csi", MergeNone, "aarch64_sve_cmpne_wide">; -def SVCMPGE_WIDE : SInst<"svcmpge_wide[_{d}]", "PPdw", "csi", MergeNone, "aarch64_sve_cmpge_wide">; -def SVCMPGT_WIDE : SInst<"svcmpgt_wide[_{d}]", "PPdw", "csi", MergeNone, "aarch64_sve_cmpgt_wide">; -def SVCMPLE_WIDE : SInst<"svcmple_wide[_{d}]", "PPdw", "csi", MergeNone, "aarch64_sve_cmple_wide">; -def SVCMPLT_WIDE : SInst<"svcmplt_wide[_{d}]", "PPdw", "csi", MergeNone, "aarch64_sve_cmplt_wide">; -def SVCMPHI_WIDE : SInst<"svcmpgt_wide[_{d}]", "PPdw", "UcUsUi", MergeNone, "aarch64_sve_cmphi_wide">; -def SVCMPHS_WIDE : SInst<"svcmpge_wide[_{d}]", "PPdw", "UcUsUi", MergeNone, "aarch64_sve_cmphs_wide">; -def SVCMPLO_WIDE : SInst<"svcmplt_wide[_{d}]", "PPdw", "UcUsUi", MergeNone, "aarch64_sve_cmplo_wide">; -def SVCMPLS_WIDE : SInst<"svcmple_wide[_{d}]", "PPdw", "UcUsUi", MergeNone, "aarch64_sve_cmpls_wide">; - -def SVCMPEQ_WIDE_N : SInst<"svcmpeq_wide[_n_{d}]", "PPdj", "csi", MergeNone, "aarch64_sve_cmpeq_wide">; -def SVCMPNE_WIDE_N : SInst<"svcmpne_wide[_n_{d}]", "PPdj", "csi", MergeNone, "aarch64_sve_cmpne_wide">; -def SVCMPGE_WIDE_N : SInst<"svcmpge_wide[_n_{d}]", "PPdj", "csi", MergeNone, "aarch64_sve_cmpge_wide">; -def SVCMPGT_WIDE_N : SInst<"svcmpgt_wide[_n_{d}]", "PPdj", "csi", MergeNone, "aarch64_sve_cmpgt_wide">; -def SVCMPLE_WIDE_N : SInst<"svcmple_wide[_n_{d}]", "PPdj", "csi", MergeNone, "aarch64_sve_cmple_wide">; -def SVCMPLT_WIDE_N : SInst<"svcmplt_wide[_n_{d}]", "PPdj", "csi", MergeNone, "aarch64_sve_cmplt_wide">; -def SVCMPHS_WIDE_N : SInst<"svcmpge_wide[_n_{d}]", "PPdj", "UcUsUi", MergeNone, "aarch64_sve_cmphs_wide">; -def SVCMPHI_WIDE_N : SInst<"svcmpgt_wide[_n_{d}]", "PPdj", "UcUsUi", MergeNone, "aarch64_sve_cmphi_wide">; -def SVCMPLO_WIDE_N : SInst<"svcmplt_wide[_n_{d}]", "PPdj", "UcUsUi", MergeNone, "aarch64_sve_cmplo_wide">; -def SVCMPLS_WIDE_N : SInst<"svcmple_wide[_n_{d}]", "PPdj", "UcUsUi", MergeNone, "aarch64_sve_cmpls_wide">; +def SVCMPEQ : SInst<"svcmpeq[_{d}]", "PPdd", "csilUcUsUiUl", MergeNone, "aarch64_sve_cmpeq", [IsStreamingCompatible]>; +def SVCMPNE : SInst<"svcmpne[_{d}]", "PPdd", "csilUcUsUiUl", MergeNone, "aarch64_sve_cmpne", [IsStreamingCompatible]>; +def SVCMPGE : SInst<"svcmpge[_{d}]", "PPdd", "csil", MergeNone, "aarch64_sve_cmpge", [IsStreamingCompatible]>; +def SVCMPGT : SInst<"svcmpgt[_{d}]", "PPdd", "csil", 
MergeNone, "aarch64_sve_cmpgt", [IsStreamingCompatible]>; +def SVCMPLE : SInst<"svcmple[_{d}]", "PPdd", "csil", MergeNone, "aarch64_sve_cmpge", [ReverseCompare, IsStreamingCompatible]>; +def SVCMPLT : SInst<"svcmplt[_{d}]", "PPdd", "csil", MergeNone, "aarch64_sve_cmpgt", [ReverseCompare, IsStreamingCompatible]>; +def SVCMPHI : SInst<"svcmpgt[_{d}]", "PPdd", "UcUsUiUl", MergeNone, "aarch64_sve_cmphi", [IsStreamingCompatible]>; +def SVCMPHS : SInst<"svcmpge[_{d}]", "PPdd", "UcUsUiUl", MergeNone, "aarch64_sve_cmphs", [IsStreamingCompatible]>; +def SVCMPLO : SInst<"svcmplt[_{d}]", "PPdd", "UcUsUiUl", MergeNone, "aarch64_sve_cmphi", [ReverseCompare, IsStreamingCompatible]>; +def SVCMPLS : SInst<"svcmple[_{d}]", "PPdd", "UcUsUiUl", MergeNone, "aarch64_sve_cmphs", [ReverseCompare, IsStreamingCompatible]>; + +def SVCMPEQ_N : SInst<"svcmpeq[_n_{d}]", "PPda", "csilUcUsUiUl", MergeNone, "aarch64_sve_cmpeq", [IsStreamingCompatible]>; +def SVCMPNE_N : SInst<"svcmpne[_n_{d}]", "PPda", "csilUcUsUiUl", MergeNone, "aarch64_sve_cmpne", [IsStreamingCompatible]>; +def SVCMPGE_N : SInst<"svcmpge[_n_{d}]", "PPda", "csil", MergeNone, "aarch64_sve_cmpge", [IsStreamingCompatible]>; +def SVCMPGT_N : SInst<"svcmpgt[_n_{d}]", "PPda", "csil", MergeNone, "aarch64_sve_cmpgt", [IsStreamingCompatible]>; +def SVCMPLE_N : SInst<"svcmple[_n_{d}]", "PPda", "csil", MergeNone, "aarch64_sve_cmpge", [ReverseCompare, IsStreamingCompatible]>; +def SVCMPLT_N : SInst<"svcmplt[_n_{d}]", "PPda", "csil", MergeNone, "aarch64_sve_cmpgt", [ReverseCompare, IsStreamingCompatible]>; +def SVCMPHS_N : SInst<"svcmpge[_n_{d}]", "PPda", "UcUsUiUl", MergeNone, "aarch64_sve_cmphs", [IsStreamingCompatible]>; +def SVCMPHI_N : SInst<"svcmpgt[_n_{d}]", "PPda", "UcUsUiUl", MergeNone, "aarch64_sve_cmphi", [IsStreamingCompatible]>; +def SVCMPLS_N : SInst<"svcmple[_n_{d}]", "PPda", "UcUsUiUl", MergeNone, "aarch64_sve_cmphs", [ReverseCompare, IsStreamingCompatible]>; +def SVCMPLO_N : SInst<"svcmplt[_n_{d}]", "PPda", "UcUsUiUl", MergeNone, "aarch64_sve_cmphi", [ReverseCompare, IsStreamingCompatible]>; + +def SVCMPEQ_WIDE : SInst<"svcmpeq_wide[_{d}]", "PPdw", "csi", MergeNone, "aarch64_sve_cmpeq_wide", [IsStreamingCompatible]>; +def SVCMPNE_WIDE : SInst<"svcmpne_wide[_{d}]", "PPdw", "csi", MergeNone, "aarch64_sve_cmpne_wide", [IsStreamingCompatible]>; +def SVCMPGE_WIDE : SInst<"svcmpge_wide[_{d}]", "PPdw", "csi", MergeNone, "aarch64_sve_cmpge_wide", [IsStreamingCompatible]>; +def SVCMPGT_WIDE : SInst<"svcmpgt_wide[_{d}]", "PPdw", "csi", MergeNone, "aarch64_sve_cmpgt_wide", [IsStreamingCompatible]>; +def SVCMPLE_WIDE : SInst<"svcmple_wide[_{d}]", "PPdw", "csi", MergeNone, "aarch64_sve_cmple_wide", [IsStreamingCompatible]>; +def SVCMPLT_WIDE : SInst<"svcmplt_wide[_{d}]", "PPdw", "csi", MergeNone, "aarch64_sve_cmplt_wide", [IsStreamingCompatible]>; +def SVCMPHI_WIDE : SInst<"svcmpgt_wide[_{d}]", "PPdw", "UcUsUi", MergeNone, "aarch64_sve_cmphi_wide", [IsStreamingCompatible]>; +def SVCMPHS_WIDE : SInst<"svcmpge_wide[_{d}]", "PPdw", "UcUsUi", MergeNone, "aarch64_sve_cmphs_wide", [IsStreamingCompatible]>; +def SVCMPLO_WIDE : SInst<"svcmplt_wide[_{d}]", "PPdw", "UcUsUi", MergeNone, "aarch64_sve_cmplo_wide", [IsStreamingCompatible]>; +def SVCMPLS_WIDE : SInst<"svcmple_wide[_{d}]", "PPdw", "UcUsUi", MergeNone, "aarch64_sve_cmpls_wide", [IsStreamingCompatible]>; + +def SVCMPEQ_WIDE_N : SInst<"svcmpeq_wide[_n_{d}]", "PPdj", "csi", MergeNone, "aarch64_sve_cmpeq_wide", [IsStreamingCompatible]>; +def SVCMPNE_WIDE_N : SInst<"svcmpne_wide[_n_{d}]", "PPdj", "csi", MergeNone, 
"aarch64_sve_cmpne_wide", [IsStreamingCompatible]>; +def SVCMPGE_WIDE_N : SInst<"svcmpge_wide[_n_{d}]", "PPdj", "csi", MergeNone, "aarch64_sve_cmpge_wide", [IsStreamingCompatible]>; +def SVCMPGT_WIDE_N : SInst<"svcmpgt_wide[_n_{d}]", "PPdj", "csi", MergeNone, "aarch64_sve_cmpgt_wide", [IsStreamingCompatible]>; +def SVCMPLE_WIDE_N : SInst<"svcmple_wide[_n_{d}]", "PPdj", "csi", MergeNone, "aarch64_sve_cmple_wide", [IsStreamingCompatible]>; +def SVCMPLT_WIDE_N : SInst<"svcmplt_wide[_n_{d}]", "PPdj", "csi", MergeNone, "aarch64_sve_cmplt_wide", [IsStreamingCompatible]>; +def SVCMPHS_WIDE_N : SInst<"svcmpge_wide[_n_{d}]", "PPdj", "UcUsUi", MergeNone, "aarch64_sve_cmphs_wide", [IsStreamingCompatible]>; +def SVCMPHI_WIDE_N : SInst<"svcmpgt_wide[_n_{d}]", "PPdj", "UcUsUi", MergeNone, "aarch64_sve_cmphi_wide", [IsStreamingCompatible]>; +def SVCMPLO_WIDE_N : SInst<"svcmplt_wide[_n_{d}]", "PPdj", "UcUsUi", MergeNone, "aarch64_sve_cmplo_wide", [IsStreamingCompatible]>; +def SVCMPLS_WIDE_N : SInst<"svcmple_wide[_n_{d}]", "PPdj", "UcUsUi", MergeNone, "aarch64_sve_cmpls_wide", [IsStreamingCompatible]>; //////////////////////////////////////////////////////////////////////////////// // While comparisons -def SVWHILELE_S32 : SInst<"svwhilele_{d}[_{1}]", "Pkk", "PcPsPiPl", MergeNone, "aarch64_sve_whilele", [IsOverloadWhile]>; -def SVWHILELE_S64 : SInst<"svwhilele_{d}[_{1}]", "Pll", "PcPsPiPl", MergeNone, "aarch64_sve_whilele", [IsOverloadWhile]>; -def SVWHILELO_U32 : SInst<"svwhilelt_{d}[_{1}]", "Pmm", "PUcPUsPUiPUl", MergeNone, "aarch64_sve_whilelo", [IsOverloadWhile]>; -def SVWHILELO_U64 : SInst<"svwhilelt_{d}[_{1}]", "Pnn", "PUcPUsPUiPUl", MergeNone, "aarch64_sve_whilelo", [IsOverloadWhile]>; -def SVWHILELS_U32 : SInst<"svwhilele_{d}[_{1}]", "Pmm", "PUcPUsPUiPUl", MergeNone, "aarch64_sve_whilels", [IsOverloadWhile]>; -def SVWHILELS_U64 : SInst<"svwhilele_{d}[_{1}]", "Pnn", "PUcPUsPUiPUl", MergeNone, "aarch64_sve_whilels", [IsOverloadWhile]>; -def SVWHILELT_S32 : SInst<"svwhilelt_{d}[_{1}]", "Pkk", "PcPsPiPl", MergeNone, "aarch64_sve_whilelt", [IsOverloadWhile]>; -def SVWHILELT_S64 : SInst<"svwhilelt_{d}[_{1}]", "Pll", "PcPsPiPl", MergeNone, "aarch64_sve_whilelt", [IsOverloadWhile]>; +def SVWHILELE_S32 : SInst<"svwhilele_{d}[_{1}]", "Pkk", "PcPsPiPl", MergeNone, "aarch64_sve_whilele", [IsOverloadWhile, IsStreamingCompatible]>; +def SVWHILELE_S64 : SInst<"svwhilele_{d}[_{1}]", "Pll", "PcPsPiPl", MergeNone, "aarch64_sve_whilele", [IsOverloadWhile, IsStreamingCompatible]>; +def SVWHILELO_U32 : SInst<"svwhilelt_{d}[_{1}]", "Pmm", "PUcPUsPUiPUl", MergeNone, "aarch64_sve_whilelo", [IsOverloadWhile, IsStreamingCompatible]>; +def SVWHILELO_U64 : SInst<"svwhilelt_{d}[_{1}]", "Pnn", "PUcPUsPUiPUl", MergeNone, "aarch64_sve_whilelo", [IsOverloadWhile, IsStreamingCompatible]>; +def SVWHILELS_U32 : SInst<"svwhilele_{d}[_{1}]", "Pmm", "PUcPUsPUiPUl", MergeNone, "aarch64_sve_whilels", [IsOverloadWhile, IsStreamingCompatible]>; +def SVWHILELS_U64 : SInst<"svwhilele_{d}[_{1}]", "Pnn", "PUcPUsPUiPUl", MergeNone, "aarch64_sve_whilels", [IsOverloadWhile, IsStreamingCompatible]>; +def SVWHILELT_S32 : SInst<"svwhilelt_{d}[_{1}]", "Pkk", "PcPsPiPl", MergeNone, "aarch64_sve_whilelt", [IsOverloadWhile, IsStreamingCompatible]>; +def SVWHILELT_S64 : SInst<"svwhilelt_{d}[_{1}]", "Pll", "PcPsPiPl", MergeNone, "aarch64_sve_whilelt", [IsOverloadWhile, IsStreamingCompatible]>; //////////////////////////////////////////////////////////////////////////////// // Counting bit @@ -947,12 +729,12 @@ def _Z : SInst; } -defm SVCLS : 
SInstCLS<"svcls", "csil", "aarch64_sve_cls">; -defm SVCLZ : SInstCLS<"svclz", "csilUcUsUiUl", "aarch64_sve_clz">; -defm SVCNT : SInstCLS<"svcnt", "csilUcUsUiUlhfd", "aarch64_sve_cnt">; +defm SVCLS : SInstCLS<"svcls", "csil", "aarch64_sve_cls", [IsStreamingCompatible]>; +defm SVCLZ : SInstCLS<"svclz", "csilUcUsUiUl", "aarch64_sve_clz", [IsStreamingCompatible]>; +defm SVCNT : SInstCLS<"svcnt", "csilUcUsUiUlhfd", "aarch64_sve_cnt", [IsStreamingCompatible]>; let ArchGuard = "defined(__ARM_FEATURE_SVE_BF16)" in { - defm SVCNT_BF16 : SInstCLS<"svcnt", "b", "aarch64_sve_cnt">; + defm SVCNT_BF16 : SInstCLS<"svcnt", "b", "aarch64_sve_cnt", [IsStreamingCompatible]>; } //////////////////////////////////////////////////////////////////////////////// @@ -1007,13 +789,13 @@ def SVTSMUL : SInst<"svtsmul[_{d}]", "ddu", "hfd", MergeNone, "aarch64_sve_ftsmul_x">; def SVTSSEL : SInst<"svtssel[_{d}]", "ddu", "hfd", MergeNone, "aarch64_sve_ftssel_x">; -def SVSCALE_M : SInst<"svscale[_{d}]", "dPdx", "hfd", MergeOp1, "aarch64_sve_fscale">; -def SVSCALE_X : SInst<"svscale[_{d}]", "dPdx", "hfd", MergeAny, "aarch64_sve_fscale">; -def SVSCALE_Z : SInst<"svscale[_{d}]", "dPdx", "hfd", MergeZero, "aarch64_sve_fscale">; +def SVSCALE_M : SInst<"svscale[_{d}]", "dPdx", "hfd", MergeOp1, "aarch64_sve_fscale", [IsStreamingCompatible]>; +def SVSCALE_X : SInst<"svscale[_{d}]", "dPdx", "hfd", MergeAny, "aarch64_sve_fscale", [IsStreamingCompatible]>; +def SVSCALE_Z : SInst<"svscale[_{d}]", "dPdx", "hfd", MergeZero, "aarch64_sve_fscale", [IsStreamingCompatible]>; -def SVSCALE_N_M : SInst<"svscale[_n_{d}]", "dPdK", "hfd", MergeOp1, "aarch64_sve_fscale">; -def SVSCALE_N_X : SInst<"svscale[_n_{d}]", "dPdK", "hfd", MergeAny, "aarch64_sve_fscale">; -def SVSCALE_N_Z : SInst<"svscale[_n_{d}]", "dPdK", "hfd", MergeZero, "aarch64_sve_fscale">; +def SVSCALE_N_M : SInst<"svscale[_n_{d}]", "dPdK", "hfd", MergeOp1, "aarch64_sve_fscale", [IsStreamingCompatible]>; +def SVSCALE_N_X : SInst<"svscale[_n_{d}]", "dPdK", "hfd", MergeAny, "aarch64_sve_fscale", [IsStreamingCompatible]>; +def SVSCALE_N_Z : SInst<"svscale[_n_{d}]", "dPdK", "hfd", MergeZero, "aarch64_sve_fscale", [IsStreamingCompatible]>; defm SVMAD_F : SInstZPZZZ<"svmad", "hfd", "aarch64_sve_fmad">; defm SVMLA_F : SInstZPZZZ<"svmla", "hfd", "aarch64_sve_fmla">; @@ -1024,42 +806,42 @@ defm SVNMLS_F : SInstZPZZZ<"svnmls", "hfd", "aarch64_sve_fnmls">; defm SVNMSB_F : SInstZPZZZ<"svnmsb", "hfd", "aarch64_sve_fnmsb">; -def SVCADD_M : SInst<"svcadd[_{d}]", "dPddi", "hfd", MergeOp1, "aarch64_sve_fcadd", [], [ImmCheck<3, ImmCheckComplexRot90_270>]>; -def SVCADD_X : SInst<"svcadd[_{d}]", "dPddi", "hfd", MergeAny, "aarch64_sve_fcadd", [], [ImmCheck<3, ImmCheckComplexRot90_270>]>; -def SVCADD_Z : SInst<"svcadd[_{d}]", "dPddi", "hfd", MergeZero, "aarch64_sve_fcadd", [], [ImmCheck<3, ImmCheckComplexRot90_270>]>; -def SVCMLA_M : SInst<"svcmla[_{d}]", "dPdddi", "hfd", MergeOp1, "aarch64_sve_fcmla", [], [ImmCheck<4, ImmCheckComplexRotAll90>]>; -def SVCMLA_X : SInst<"svcmla[_{d}]", "dPdddi", "hfd", MergeAny, "aarch64_sve_fcmla", [], [ImmCheck<4, ImmCheckComplexRotAll90>]>; -def SVCMLA_Z : SInst<"svcmla[_{d}]", "dPdddi", "hfd", MergeZero, "aarch64_sve_fcmla", [], [ImmCheck<4, ImmCheckComplexRotAll90>]>; +def SVCADD_M : SInst<"svcadd[_{d}]", "dPddi", "hfd", MergeOp1, "aarch64_sve_fcadd", [IsStreamingCompatible], [ImmCheck<3, ImmCheckComplexRot90_270>]>; +def SVCADD_X : SInst<"svcadd[_{d}]", "dPddi", "hfd", MergeAny, "aarch64_sve_fcadd", [IsStreamingCompatible], [ImmCheck<3, ImmCheckComplexRot90_270>]>; 
+def SVCADD_Z : SInst<"svcadd[_{d}]", "dPddi", "hfd", MergeZero, "aarch64_sve_fcadd", [IsStreamingCompatible], [ImmCheck<3, ImmCheckComplexRot90_270>]>; +def SVCMLA_M : SInst<"svcmla[_{d}]", "dPdddi", "hfd", MergeOp1, "aarch64_sve_fcmla", [IsStreamingCompatible], [ImmCheck<4, ImmCheckComplexRotAll90>]>; +def SVCMLA_X : SInst<"svcmla[_{d}]", "dPdddi", "hfd", MergeAny, "aarch64_sve_fcmla", [IsStreamingCompatible], [ImmCheck<4, ImmCheckComplexRotAll90>]>; +def SVCMLA_Z : SInst<"svcmla[_{d}]", "dPdddi", "hfd", MergeZero, "aarch64_sve_fcmla", [IsStreamingCompatible], [ImmCheck<4, ImmCheckComplexRotAll90>]>; -def SVCMLA_LANE : SInst<"svcmla_lane[_{d}]", "ddddii", "hf", MergeNone, "aarch64_sve_fcmla_lane", [], [ImmCheck<3, ImmCheckLaneIndexCompRotate, 2>, +def SVCMLA_LANE : SInst<"svcmla_lane[_{d}]", "ddddii", "hf", MergeNone, "aarch64_sve_fcmla_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndexCompRotate, 2>, ImmCheck<4, ImmCheckComplexRotAll90>]>; -def SVMLA_LANE : SInst<"svmla_lane[_{d}]", "ddddi", "hfd", MergeNone, "aarch64_sve_fmla_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; -def SVMLS_LANE : SInst<"svmls_lane[_{d}]", "ddddi", "hfd", MergeNone, "aarch64_sve_fmls_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; -def SVMUL_LANE : SInst<"svmul_lane[_{d}]", "dddi", "hfd", MergeNone, "aarch64_sve_fmul_lane", [], [ImmCheck<2, ImmCheckLaneIndex, 1>]>; +def SVMLA_LANE : SInst<"svmla_lane[_{d}]", "ddddi", "hfd", MergeNone, "aarch64_sve_fmla_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; +def SVMLS_LANE : SInst<"svmls_lane[_{d}]", "ddddi", "hfd", MergeNone, "aarch64_sve_fmls_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; +def SVMUL_LANE : SInst<"svmul_lane[_{d}]", "dddi", "hfd", MergeNone, "aarch64_sve_fmul_lane", [IsStreamingCompatible], [ImmCheck<2, ImmCheckLaneIndex, 1>]>; -def SVRECPE : SInst<"svrecpe[_{d}]", "dd", "hfd", MergeNone, "aarch64_sve_frecpe_x">; -def SVRECPS : SInst<"svrecps[_{d}]", "ddd", "hfd", MergeNone, "aarch64_sve_frecps_x">; -def SVRSQRTE : SInst<"svrsqrte[_{d}]", "dd", "hfd", MergeNone, "aarch64_sve_frsqrte_x">; -def SVRSQRTS : SInst<"svrsqrts[_{d}]", "ddd", "hfd", MergeNone, "aarch64_sve_frsqrts_x">; +def SVRECPE : SInst<"svrecpe[_{d}]", "dd", "hfd", MergeNone, "aarch64_sve_frecpe_x", [IsStreamingCompatible]>; +def SVRECPS : SInst<"svrecps[_{d}]", "ddd", "hfd", MergeNone, "aarch64_sve_frecps_x", [IsStreamingCompatible]>; +def SVRSQRTE : SInst<"svrsqrte[_{d}]", "dd", "hfd", MergeNone, "aarch64_sve_frsqrte_x", [IsStreamingCompatible]>; +def SVRSQRTS : SInst<"svrsqrts[_{d}]", "ddd", "hfd", MergeNone, "aarch64_sve_frsqrts_x", [IsStreamingCompatible]>; //////////////////////////////////////////////////////////////////////////////// // Floating-point reductions def SVFADDA : SInst<"svadda[_{d}]", "sPsd", "hfd", MergeNone, "aarch64_sve_fadda">; -def SVFADDV : SInst<"svaddv[_{d}]", "sPd", "hfd", MergeNone, "aarch64_sve_faddv">; -def SVFMAXV : SInst<"svmaxv[_{d}]", "sPd", "hfd", MergeNone, "aarch64_sve_fmaxv">; -def SVFMAXNMV : SInst<"svmaxnmv[_{d}]", "sPd", "hfd", MergeNone, "aarch64_sve_fmaxnmv">; -def SVFMINV : SInst<"svminv[_{d}]", "sPd", "hfd", MergeNone, "aarch64_sve_fminv">; -def SVFMINNMV : SInst<"svminnmv[_{d}]", "sPd", "hfd", MergeNone, "aarch64_sve_fminnmv">; +def SVFADDV : SInst<"svaddv[_{d}]", "sPd", "hfd", MergeNone, "aarch64_sve_faddv", [IsStreamingCompatible]>; +def SVFMAXV : SInst<"svmaxv[_{d}]", "sPd", "hfd", MergeNone, "aarch64_sve_fmaxv", [IsStreamingCompatible]>; +def SVFMAXNMV : SInst<"svmaxnmv[_{d}]", 
"sPd", "hfd", MergeNone, "aarch64_sve_fmaxnmv", [IsStreamingCompatible]>; +def SVFMINV : SInst<"svminv[_{d}]", "sPd", "hfd", MergeNone, "aarch64_sve_fminv", [IsStreamingCompatible]>; +def SVFMINNMV : SInst<"svminnmv[_{d}]", "sPd", "hfd", MergeNone, "aarch64_sve_fminnmv", [IsStreamingCompatible]>; //////////////////////////////////////////////////////////////////////////////// // Floating-point comparisons -def SVACGE : SInst<"svacge[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_facge">; -def SVACGT : SInst<"svacgt[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_facgt">; -def SVACLE : SInst<"svacle[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_facge", [ReverseCompare]>; -def SVACLT : SInst<"svaclt[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_facgt", [ReverseCompare]>; -def SVCMPUO : SInst<"svcmpuo[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_fcmpuo">; +def SVACGE : SInst<"svacge[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_facge", [IsStreamingCompatible]>; +def SVACGT : SInst<"svacgt[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_facgt", [IsStreamingCompatible]>; +def SVACLE : SInst<"svacle[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_facge", [ReverseCompare, IsStreamingCompatible]>; +def SVACLT : SInst<"svaclt[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_facgt", [ReverseCompare, IsStreamingCompatible]>; +def SVCMPUO : SInst<"svcmpuo[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_fcmpuo", [IsStreamingCompatible]>; def SVACGE_N : SInst<"svacge[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_facge">; def SVACGT_N : SInst<"svacgt[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_facgt">; @@ -1067,19 +849,19 @@ def SVACLT_N : SInst<"svaclt[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_facgt", [ReverseCompare]>; def SVCMPUO_N : SInst<"svcmpuo[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_fcmpuo">; -def SVCMPEQ_F : SInst<"svcmpeq[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_fcmpeq">; -def SVCMPNE_F : SInst<"svcmpne[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_fcmpne">; -def SVCMPGE_F : SInst<"svcmpge[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_fcmpge">; -def SVCMPGT_F : SInst<"svcmpgt[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_fcmpgt">; -def SVCMPLE_F : SInst<"svcmple[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_fcmpge", [ReverseCompare]>; -def SVCMPLT_F : SInst<"svcmplt[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_fcmpgt", [ReverseCompare]>; +def SVCMPEQ_F : SInst<"svcmpeq[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_fcmpeq", [IsStreamingCompatible]>; +def SVCMPNE_F : SInst<"svcmpne[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_fcmpne", [IsStreamingCompatible]>; +def SVCMPGE_F : SInst<"svcmpge[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_fcmpge", [IsStreamingCompatible]>; +def SVCMPGT_F : SInst<"svcmpgt[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_fcmpgt", [IsStreamingCompatible]>; +def SVCMPLE_F : SInst<"svcmple[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_fcmpge", [ReverseCompare, IsStreamingCompatible]>; +def SVCMPLT_F : SInst<"svcmplt[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_fcmpgt", [ReverseCompare, IsStreamingCompatible]>; -def SVCMPEQ_F_N : SInst<"svcmpeq[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_fcmpeq">; -def SVCMPNE_F_N : SInst<"svcmpne[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_fcmpne">; -def SVCMPGE_F_N : SInst<"svcmpge[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_fcmpge">; -def SVCMPGT_F_N : SInst<"svcmpgt[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_fcmpgt">; -def SVCMPLE_F_N : SInst<"svcmple[_n_{d}]", "PPda", 
"hfd", MergeNone, "aarch64_sve_fcmpge", [ReverseCompare]>; -def SVCMPLT_F_N : SInst<"svcmplt[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_fcmpgt", [ReverseCompare]>; +def SVCMPEQ_F_N : SInst<"svcmpeq[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_fcmpeq", [IsStreamingCompatible]>; +def SVCMPNE_F_N : SInst<"svcmpne[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_fcmpne", [IsStreamingCompatible]>; +def SVCMPGE_F_N : SInst<"svcmpge[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_fcmpge", [IsStreamingCompatible]>; +def SVCMPGT_F_N : SInst<"svcmpgt[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_fcmpgt", [IsStreamingCompatible]>; +def SVCMPLE_F_N : SInst<"svcmple[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_fcmpge", [ReverseCompare, IsStreamingCompatible]>; +def SVCMPLT_F_N : SInst<"svcmplt[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_fcmpgt", [ReverseCompare, IsStreamingCompatible]>; //////////////////////////////////////////////////////////////////////////////// // Floating-point conversions @@ -1087,16 +869,16 @@ multiclass SInstCvtMXZ< string name, string m_types, string xz_types, string types, string intrinsic, list flags = [IsOverloadNone]> { - def _M : SInst; - def _X : SInst; - def _Z : SInst; + def _M : SInst; + def _X : SInst; + def _Z : SInst; } multiclass SInstCvtMX flags = [IsOverloadNone]> { - def _M : SInst; - def _X : SInst; + def _M : SInst; + def _X : SInst; } // svcvt_s##_f16 @@ -1110,7 +892,7 @@ let ArchGuard = "defined(__ARM_FEATURE_SVE_BF16)" in { defm SVCVT_BF16_F32 : SInstCvtMXZ<"svcvt_bf16[_f32]", "ddPM", "dPM", "b", "aarch64_sve_fcvt_bf16f32">; - def SVCVTNT_BF16_F32 : SInst<"svcvtnt_bf16[_f32]", "ddPM", "b", MergeOp1, "aarch64_sve_fcvtnt_bf16f32", [IsOverloadNone]>; + def SVCVTNT_BF16_F32 : SInst<"svcvtnt_bf16[_f32]", "ddPM", "b", MergeOp1, "aarch64_sve_fcvtnt_bf16f32", [IsOverloadNone, IsStreamingCompatible]>; } // svcvt_s##_f64 @@ -1174,11 +956,11 @@ defm SVCVTX_F32 : SInstCvtMXZ<"svcvtx_f32[_f64]", "MMPd", "MPd", "d", "aarch64_sve_fcvtx_f32f64">; -def SVCVTNT_F32 : SInst<"svcvtnt_f16[_f32]", "hhPd", "f", MergeOp1, "aarch64_sve_fcvtnt_f16f32", [IsOverloadNone]>; -def SVCVTNT_F64 : SInst<"svcvtnt_f32[_f64]", "hhPd", "d", MergeOp1, "aarch64_sve_fcvtnt_f32f64", [IsOverloadNone]>; +def SVCVTNT_F32 : SInst<"svcvtnt_f16[_f32]", "hhPd", "f", MergeOp1, "aarch64_sve_fcvtnt_f16f32", [IsOverloadNone, IsStreamingCompatible]>; +def SVCVTNT_F64 : SInst<"svcvtnt_f32[_f64]", "hhPd", "d", MergeOp1, "aarch64_sve_fcvtnt_f32f64", [IsOverloadNone, IsStreamingCompatible]>; // SVCVTNT_X : Implemented as macro by SveEmitter.cpp -def SVCVTXNT_F32 : SInst<"svcvtxnt_f32[_f64]", "MMPd", "d", MergeOp1, "aarch64_sve_fcvtxnt_f32f64", [IsOverloadNone]>; +def SVCVTXNT_F32 : SInst<"svcvtxnt_f32[_f64]", "MMPd", "d", MergeOp1, "aarch64_sve_fcvtxnt_f32f64", [IsOverloadNone, IsStreamingCompatible]>; // SVCVTXNT_X_F32 : Implemented as macro by SveEmitter.cpp } @@ -1187,9 +969,9 @@ // Permutations and selection multiclass SVEPerm { - def : SInst; + def : SInst; let ArchGuard = "defined(__ARM_FEATURE_SVE_BF16)" in { - def: SInst; + def: SInst; } } @@ -1213,60 +995,60 @@ let ArchGuard = "defined(__ARM_FEATURE_SVE_BF16)" in { def SVDUPQ_LANE_BF16 : SInst<"svdupq_lane[_{d}]", "ddn", "b", MergeNone, "aarch64_sve_dupq_lane">; } -def SVEXT : SInst<"svext[_{d}]", "dddi", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_ext", [], [ImmCheck<2, ImmCheckExtract, 1>]>; +def SVEXT : SInst<"svext[_{d}]", "dddi", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_ext", [IsStreamingCompatible], [ImmCheck<2, 
ImmCheckExtract, 1>]>; defm SVLASTA : SVEPerm<"svlasta[_{d}]", "sPd", "aarch64_sve_lasta">; defm SVLASTB : SVEPerm<"svlastb[_{d}]", "sPd", "aarch64_sve_lastb">; -def SVREV : SInst<"svrev[_{d}]", "dd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_rev">; -def SVSEL : SInst<"svsel[_{d}]", "dPdd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_sel">; -def SVSPLICE : SInst<"svsplice[_{d}]", "dPdd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_splice">; -def SVTBL : SInst<"svtbl[_{d}]", "ddu", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_tbl">; +def SVREV : SInst<"svrev[_{d}]", "dd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_rev", [IsStreamingCompatible]>; +def SVSEL : SInst<"svsel[_{d}]", "dPdd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_sel", [IsStreamingCompatible]>; +def SVSPLICE : SInst<"svsplice[_{d}]", "dPdd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_splice", [IsStreamingCompatible]>; +def SVTBL : SInst<"svtbl[_{d}]", "ddu", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_tbl", [IsStreamingCompatible]>; let ArchGuard = "defined(__ARM_FEATURE_SVE_BF16)" in { - def SVTBL_BF16 : SInst<"svtbl[_{d}]", "ddu", "b", MergeNone, "aarch64_sve_tbl">; + def SVTBL_BF16 : SInst<"svtbl[_{d}]", "ddu", "b", MergeNone, "aarch64_sve_tbl", [IsStreamingCompatible]>; } -def SVTRN1 : SInst<"svtrn1[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_trn1">; -def SVTRN2 : SInst<"svtrn2[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_trn2">; -def SVUNPKHI_S : SInst<"svunpkhi[_{d}]", "dh", "sil", MergeNone, "aarch64_sve_sunpkhi">; -def SVUNPKHI_U : SInst<"svunpkhi[_{d}]", "dh", "UsUiUl", MergeNone, "aarch64_sve_uunpkhi">; -def SVUNPKLO_S : SInst<"svunpklo[_{d}]", "dh", "sil", MergeNone, "aarch64_sve_sunpklo">; -def SVUNPKLO_U : SInst<"svunpklo[_{d}]", "dh", "UsUiUl", MergeNone, "aarch64_sve_uunpklo">; -def SVUZP1 : SInst<"svuzp1[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_uzp1">; -def SVUZP2 : SInst<"svuzp2[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_uzp2">; -def SVZIP1 : SInst<"svzip1[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_zip1">; -def SVZIP2 : SInst<"svzip2[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_zip2">; +def SVTRN1 : SInst<"svtrn1[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_trn1", [IsStreamingCompatible]>; +def SVTRN2 : SInst<"svtrn2[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_trn2", [IsStreamingCompatible]>; +def SVUNPKHI_S : SInst<"svunpkhi[_{d}]", "dh", "sil", MergeNone, "aarch64_sve_sunpkhi", [IsStreamingCompatible]>; +def SVUNPKHI_U : SInst<"svunpkhi[_{d}]", "dh", "UsUiUl", MergeNone, "aarch64_sve_uunpkhi", [IsStreamingCompatible]>; +def SVUNPKLO_S : SInst<"svunpklo[_{d}]", "dh", "sil", MergeNone, "aarch64_sve_sunpklo", [IsStreamingCompatible]>; +def SVUNPKLO_U : SInst<"svunpklo[_{d}]", "dh", "UsUiUl", MergeNone, "aarch64_sve_uunpklo", [IsStreamingCompatible]>; +def SVUZP1 : SInst<"svuzp1[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_uzp1", [IsStreamingCompatible]>; +def SVUZP2 : SInst<"svuzp2[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_uzp2", [IsStreamingCompatible]>; +def SVZIP1 : SInst<"svzip1[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_zip1", [IsStreamingCompatible]>; +def SVZIP2 : SInst<"svzip2[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_zip2", [IsStreamingCompatible]>; let ArchGuard = "defined(__ARM_FEATURE_SVE_BF16)" in { -def SVEXT_BF16 : SInst<"svext[_{d}]", "dddi", "b", MergeNone, "aarch64_sve_ext", [], [ImmCheck<2, ImmCheckExtract, 1>]>; -def 
SVREV_BF16 : SInst<"svrev[_{d}]", "dd", "b", MergeNone, "aarch64_sve_rev">; -def SVSEL_BF16 : SInst<"svsel[_{d}]", "dPdd", "b", MergeNone, "aarch64_sve_sel">; -def SVSPLICE_BF16 : SInst<"svsplice[_{d}]", "dPdd", "b", MergeNone, "aarch64_sve_splice">; -def SVTRN1_BF16 : SInst<"svtrn1[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_trn1">; -def SVTRN2_BF16 : SInst<"svtrn2[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_trn2">; -def SVUZP1_BF16 : SInst<"svuzp1[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_uzp1">; -def SVUZP2_BF16 : SInst<"svuzp2[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_uzp2">; -def SVZIP1_BF16 : SInst<"svzip1[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_zip1">; -def SVZIP2_BF16 : SInst<"svzip2[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_zip2">; -} - -def SVREV_B : SInst<"svrev_{d}", "PP", "PcPsPiPl", MergeNone, "aarch64_sve_rev">; -def SVSEL_B : SInst<"svsel[_b]", "PPPP", "Pc", MergeNone, "aarch64_sve_sel">; -def SVTRN1_B : SInst<"svtrn1_{d}", "PPP", "PcPsPiPl", MergeNone, "aarch64_sve_trn1">; -def SVTRN2_B : SInst<"svtrn2_{d}", "PPP", "PcPsPiPl", MergeNone, "aarch64_sve_trn2">; -def SVPUNPKHI : SInst<"svunpkhi[_b]", "PP", "Pc", MergeNone, "aarch64_sve_punpkhi">; -def SVPUNPKLO : SInst<"svunpklo[_b]", "PP", "Pc", MergeNone, "aarch64_sve_punpklo">; -def SVUZP1_B : SInst<"svuzp1_{d}", "PPP", "PcPsPiPl", MergeNone, "aarch64_sve_uzp1">; -def SVUZP2_B : SInst<"svuzp2_{d}", "PPP", "PcPsPiPl", MergeNone, "aarch64_sve_uzp2">; -def SVZIP1_B : SInst<"svzip1_{d}", "PPP", "PcPsPiPl", MergeNone, "aarch64_sve_zip1">; -def SVZIP2_B : SInst<"svzip2_{d}", "PPP", "PcPsPiPl", MergeNone, "aarch64_sve_zip2">; +def SVEXT_BF16 : SInst<"svext[_{d}]", "dddi", "b", MergeNone, "aarch64_sve_ext", [IsStreamingCompatible], [ImmCheck<2, ImmCheckExtract, 1>]>; +def SVREV_BF16 : SInst<"svrev[_{d}]", "dd", "b", MergeNone, "aarch64_sve_rev", [IsStreamingCompatible]>; +def SVSEL_BF16 : SInst<"svsel[_{d}]", "dPdd", "b", MergeNone, "aarch64_sve_sel", [IsStreamingCompatible]>; +def SVSPLICE_BF16 : SInst<"svsplice[_{d}]", "dPdd", "b", MergeNone, "aarch64_sve_splice", [IsStreamingCompatible]>; +def SVTRN1_BF16 : SInst<"svtrn1[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_trn1", [IsStreamingCompatible]>; +def SVTRN2_BF16 : SInst<"svtrn2[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_trn2", [IsStreamingCompatible]>; +def SVUZP1_BF16 : SInst<"svuzp1[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_uzp1", [IsStreamingCompatible]>; +def SVUZP2_BF16 : SInst<"svuzp2[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_uzp2", [IsStreamingCompatible]>; +def SVZIP1_BF16 : SInst<"svzip1[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_zip1", [IsStreamingCompatible]>; +def SVZIP2_BF16 : SInst<"svzip2[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_zip2", [IsStreamingCompatible]>; +} + +def SVREV_B : SInst<"svrev_{d}", "PP", "PcPsPiPl", MergeNone, "aarch64_sve_rev", [IsStreamingCompatible]>; +def SVSEL_B : SInst<"svsel[_b]", "PPPP", "Pc", MergeNone, "aarch64_sve_sel", [IsStreamingCompatible]>; +def SVTRN1_B : SInst<"svtrn1_{d}", "PPP", "PcPsPiPl", MergeNone, "aarch64_sve_trn1", [IsStreamingCompatible]>; +def SVTRN2_B : SInst<"svtrn2_{d}", "PPP", "PcPsPiPl", MergeNone, "aarch64_sve_trn2", [IsStreamingCompatible]>; +def SVPUNPKHI : SInst<"svunpkhi[_b]", "PP", "Pc", MergeNone, "aarch64_sve_punpkhi", [IsStreamingCompatible]>; +def SVPUNPKLO : SInst<"svunpklo[_b]", "PP", "Pc", MergeNone, "aarch64_sve_punpklo", [IsStreamingCompatible]>; +def SVUZP1_B : SInst<"svuzp1_{d}", "PPP", "PcPsPiPl", MergeNone, "aarch64_sve_uzp1", [IsStreamingCompatible]>; +def SVUZP2_B : 
SInst<"svuzp2_{d}", "PPP", "PcPsPiPl", MergeNone, "aarch64_sve_uzp2", [IsStreamingCompatible]>; +def SVZIP1_B : SInst<"svzip1_{d}", "PPP", "PcPsPiPl", MergeNone, "aarch64_sve_zip1", [IsStreamingCompatible]>; +def SVZIP2_B : SInst<"svzip2_{d}", "PPP", "PcPsPiPl", MergeNone, "aarch64_sve_zip2", [IsStreamingCompatible]>; //////////////////////////////////////////////////////////////////////////////// // Predicate creation -def SVPFALSE : SInst<"svpfalse[_b]", "Pv", "", MergeNone, "", [IsOverloadNone]>; +def SVPFALSE : SInst<"svpfalse[_b]", "Pv", "", MergeNone, "", [IsOverloadNone, IsStreamingCompatible]>; -def SVPTRUE_PAT : SInst<"svptrue_pat_{d}", "PI", "PcPsPiPl", MergeNone, "aarch64_sve_ptrue">; -def SVPTRUE : SInst<"svptrue_{d}", "Pv", "PcPsPiPl", MergeNone, "aarch64_sve_ptrue", [IsAppendSVALL]>; +def SVPTRUE_PAT : SInst<"svptrue_pat_{d}", "PI", "PcPsPiPl", MergeNone, "aarch64_sve_ptrue", [IsStreamingCompatible]>; +def SVPTRUE : SInst<"svptrue_{d}", "Pv", "PcPsPiPl", MergeNone, "aarch64_sve_ptrue", [IsStreamingCompatible, IsAppendSVALL]>; def SVDUPQ_B8 : SInst<"svdupq[_n]_{d}", "Pssssssssssssssss", "Pc", MergeNone>; def SVDUPQ_B16 : SInst<"svdupq[_n]_{d}", "Pssssssss", "Ps", MergeNone>; @@ -1278,33 +1060,33 @@ //////////////////////////////////////////////////////////////////////////////// // Predicate operations -def SVAND_B_Z : SInst<"svand[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_and_z">; -def SVBIC_B_Z : SInst<"svbic[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_bic_z">; -def SVEOR_B_Z : SInst<"sveor[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_eor_z">; -def SVMOV_B_Z : SInst<"svmov[_b]_z", "PPP", "Pc", MergeNone>; // Uses custom expansion -def SVNAND_B_Z : SInst<"svnand[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_nand_z">; -def SVNOR_B_Z : SInst<"svnor[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_nor_z">; -def SVNOT_B_Z : SInst<"svnot[_b]_z", "PPP", "Pc", MergeNone>; // Uses custom expansion -def SVORN_B_Z : SInst<"svorn[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_orn_z">; -def SVORR_B_Z : SInst<"svorr[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_orr_z">; - -def SVBRKA : SInst<"svbrka[_b]_m", "PPPP", "Pc", MergeNone, "aarch64_sve_brka">; -def SVBRKA_Z : SInst<"svbrka[_b]_z", "PPP", "Pc", MergeNone, "aarch64_sve_brka_z">; -def SVBRKB : SInst<"svbrkb[_b]_m", "PPPP", "Pc", MergeNone, "aarch64_sve_brkb">; -def SVBRKB_Z : SInst<"svbrkb[_b]_z", "PPP", "Pc", MergeNone, "aarch64_sve_brkb_z">; -def SVBRKN_Z : SInst<"svbrkn[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_brkn_z">; -def SVBRKPA_Z : SInst<"svbrkpa[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_brkpa_z">; -def SVBRKPB_Z : SInst<"svbrkpb[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_brkpb_z">; - -def SVPFIRST : SInst<"svpfirst[_b]", "PPP", "Pc", MergeNone, "aarch64_sve_pfirst">; -def SVPNEXT : SInst<"svpnext_{d}", "PPP", "PcPsPiPl", MergeNone, "aarch64_sve_pnext">; +def SVAND_B_Z : SInst<"svand[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_and_z", [IsStreamingCompatible]>; +def SVBIC_B_Z : SInst<"svbic[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_bic_z", [IsStreamingCompatible]>; +def SVEOR_B_Z : SInst<"sveor[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_eor_z", [IsStreamingCompatible]>; +def SVMOV_B_Z : SInst<"svmov[_b]_z", "PPP", "Pc", MergeNone, "", [IsStreamingCompatible]>; // Uses custom expansion +def SVNAND_B_Z : SInst<"svnand[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_nand_z", [IsStreamingCompatible]>; +def SVNOR_B_Z : SInst<"svnor[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_nor_z", [IsStreamingCompatible]>; 
+def SVNOT_B_Z : SInst<"svnot[_b]_z", "PPP", "Pc", MergeNone, "", [IsStreamingCompatible]>; // Uses custom expansion +def SVORN_B_Z : SInst<"svorn[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_orn_z", [IsStreamingCompatible]>; +def SVORR_B_Z : SInst<"svorr[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_orr_z", [IsStreamingCompatible]>; + +def SVBRKA : SInst<"svbrka[_b]_m", "PPPP", "Pc", MergeNone, "aarch64_sve_brka", [IsStreamingCompatible]>; +def SVBRKA_Z : SInst<"svbrka[_b]_z", "PPP", "Pc", MergeNone, "aarch64_sve_brka_z", [IsStreamingCompatible]>; +def SVBRKB : SInst<"svbrkb[_b]_m", "PPPP", "Pc", MergeNone, "aarch64_sve_brkb", [IsStreamingCompatible]>; +def SVBRKB_Z : SInst<"svbrkb[_b]_z", "PPP", "Pc", MergeNone, "aarch64_sve_brkb_z", [IsStreamingCompatible]>; +def SVBRKN_Z : SInst<"svbrkn[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_brkn_z", [IsStreamingCompatible]>; +def SVBRKPA_Z : SInst<"svbrkpa[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_brkpa_z", [IsStreamingCompatible]>; +def SVBRKPB_Z : SInst<"svbrkpb[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_brkpb_z", [IsStreamingCompatible]>; + +def SVPFIRST : SInst<"svpfirst[_b]", "PPP", "Pc", MergeNone, "aarch64_sve_pfirst", [IsStreamingCompatible]>; +def SVPNEXT : SInst<"svpnext_{d}", "PPP", "PcPsPiPl", MergeNone, "aarch64_sve_pnext", [IsStreamingCompatible]>; //////////////////////////////////////////////////////////////////////////////// // Testing predicates -def SVPTEST_ANY : SInst<"svptest_any", "sPP", "Pc", MergeNone, "aarch64_sve_ptest_any">; -def SVPTEST_FIRST : SInst<"svptest_first", "sPP", "Pc", MergeNone, "aarch64_sve_ptest_first">; -def SVPTEST_LAST : SInst<"svptest_last", "sPP", "Pc", MergeNone, "aarch64_sve_ptest_last">; +def SVPTEST_ANY : SInst<"svptest_any", "sPP", "Pc", MergeNone, "aarch64_sve_ptest_any", [IsStreamingCompatible]>; +def SVPTEST_FIRST : SInst<"svptest_first", "sPP", "Pc", MergeNone, "aarch64_sve_ptest_first", [IsStreamingCompatible]>; +def SVPTEST_LAST : SInst<"svptest_last", "sPP", "Pc", MergeNone, "aarch64_sve_ptest_last", [IsStreamingCompatible]>; //////////////////////////////////////////////////////////////////////////////// // FFR manipulation @@ -1317,21 +1099,21 @@ //////////////////////////////////////////////////////////////////////////////// // Counting elements -def SVCNTB_PAT : SInst<"svcntb_pat", "nI", "", MergeNone, "aarch64_sve_cntb", [IsOverloadNone]>; -def SVCNTH_PAT : SInst<"svcnth_pat", "nI", "", MergeNone, "aarch64_sve_cnth", [IsOverloadNone]>; -def SVCNTW_PAT : SInst<"svcntw_pat", "nI", "", MergeNone, "aarch64_sve_cntw", [IsOverloadNone]>; -def SVCNTD_PAT : SInst<"svcntd_pat", "nI", "", MergeNone, "aarch64_sve_cntd", [IsOverloadNone]>; +def SVCNTB_PAT : SInst<"svcntb_pat", "nI", "", MergeNone, "aarch64_sve_cntb", [IsOverloadNone, IsStreamingCompatible]>; +def SVCNTH_PAT : SInst<"svcnth_pat", "nI", "", MergeNone, "aarch64_sve_cnth", [IsOverloadNone, IsStreamingCompatible]>; +def SVCNTW_PAT : SInst<"svcntw_pat", "nI", "", MergeNone, "aarch64_sve_cntw", [IsOverloadNone, IsStreamingCompatible]>; +def SVCNTD_PAT : SInst<"svcntd_pat", "nI", "", MergeNone, "aarch64_sve_cntd", [IsOverloadNone, IsStreamingCompatible]>; -def SVCNTB : SInst<"svcntb", "nv", "", MergeNone, "aarch64_sve_cntb", [IsAppendSVALL, IsOverloadNone]>; -def SVCNTH : SInst<"svcnth", "nv", "", MergeNone, "aarch64_sve_cnth", [IsAppendSVALL, IsOverloadNone]>; -def SVCNTW : SInst<"svcntw", "nv", "", MergeNone, "aarch64_sve_cntw", [IsAppendSVALL, IsOverloadNone]>; -def SVCNTD : SInst<"svcntd", "nv", "", MergeNone, 
"aarch64_sve_cntd", [IsAppendSVALL, IsOverloadNone]>; +def SVCNTB : SInst<"svcntb", "nv", "", MergeNone, "aarch64_sve_cntb", [IsAppendSVALL, IsOverloadNone, IsStreamingCompatible]>; +def SVCNTH : SInst<"svcnth", "nv", "", MergeNone, "aarch64_sve_cnth", [IsAppendSVALL, IsOverloadNone, IsStreamingCompatible]>; +def SVCNTW : SInst<"svcntw", "nv", "", MergeNone, "aarch64_sve_cntw", [IsAppendSVALL, IsOverloadNone, IsStreamingCompatible]>; +def SVCNTD : SInst<"svcntd", "nv", "", MergeNone, "aarch64_sve_cntd", [IsAppendSVALL, IsOverloadNone, IsStreamingCompatible]>; -def SVCNTP : SInst<"svcntp_{d}", "nPP", "PcPsPiPl", MergeNone, "aarch64_sve_cntp">; -def SVLEN : SInst<"svlen[_{d}]", "nd", "csilUcUsUiUlhfd", MergeNone>; +def SVCNTP : SInst<"svcntp_{d}", "nPP", "PcPsPiPl", MergeNone, "aarch64_sve_cntp", [IsStreamingCompatible]>; +def SVLEN : SInst<"svlen[_{d}]", "nd", "csilUcUsUiUlhfd", MergeNone, "", [IsStreamingCompatible]>; let ArchGuard = "defined(__ARM_FEATURE_SVE_BF16)" in { -def SVLEN_BF16 : SInst<"svlen[_{d}]", "nd", "b", MergeNone>; +def SVLEN_BF16 : SInst<"svlen[_{d}]", "nd", "b", MergeNone, "", [IsStreamingCompatible]>; } //////////////////////////////////////////////////////////////////////////////// @@ -1348,20 +1130,20 @@ def UnsignedDoubleWord : sat_type<"U", "Ul">; multiclass SInst_SAT1 { - def _N32 : SInst]>; - def _N64 : SInst]>; - def _N32_ALL : SInst]>; - def _N64_ALL : SInst]>; + def _N32 : SInst]>; + def _N64 : SInst]>; + def _N32_ALL : SInst]>; + def _N64_ALL : SInst]>; } multiclass SInst_SAT2 { - def "" : SInst]>; - def _ALL : SInst]>; + def "" : SInst]>; + def _ALL : SInst]>; - def _N32 : SInst]>; - def _N64 : SInst]>; - def _N32_ALL : SInst]>; - def _N64_ALL : SInst]>; + def _N32 : SInst]>; + def _N64 : SInst]>; + def _N32_ALL : SInst]>; + def _N64_ALL : SInst]>; } defm SVQDECB_S : SInst_SAT1<"svqdecb", "aarch64_sve_sqdecb", SignedByte>; @@ -1382,32 +1164,32 @@ defm SVQINCD_S : SInst_SAT2<"svqincd", "aarch64_sve_sqincd", SignedDoubleWord>; defm SVQINCD_U : SInst_SAT2<"svqincd", "aarch64_sve_uqincd", UnsignedDoubleWord>; -def SVQDECP_S : SInst<"svqdecp[_{d}]", "ddP", "sil", MergeNone, "aarch64_sve_sqdecp">; -def SVQDECP_U : SInst<"svqdecp[_{d}]", "ddP", "UsUiUl", MergeNone, "aarch64_sve_uqdecp">; -def SVQINCP_S : SInst<"svqincp[_{d}]", "ddP", "sil", MergeNone, "aarch64_sve_sqincp">; -def SVQINCP_U : SInst<"svqincp[_{d}]", "ddP", "UsUiUl", MergeNone, "aarch64_sve_uqincp">; +def SVQDECP_S : SInst<"svqdecp[_{d}]", "ddP", "sil", MergeNone, "aarch64_sve_sqdecp", [IsStreamingCompatible]>; +def SVQDECP_U : SInst<"svqdecp[_{d}]", "ddP", "UsUiUl", MergeNone, "aarch64_sve_uqdecp", [IsStreamingCompatible]>; +def SVQINCP_S : SInst<"svqincp[_{d}]", "ddP", "sil", MergeNone, "aarch64_sve_sqincp", [IsStreamingCompatible]>; +def SVQINCP_U : SInst<"svqincp[_{d}]", "ddP", "UsUiUl", MergeNone, "aarch64_sve_uqincp", [IsStreamingCompatible]>; -def SVQDECP_N_S32 : SInst<"svqdecp[_n_s32]_{d}", "kkP", "PcPsPiPl", MergeNone, "aarch64_sve_sqdecp_n32">; -def SVQDECP_N_S64 : SInst<"svqdecp[_n_s64]_{d}", "llP", "PcPsPiPl", MergeNone, "aarch64_sve_sqdecp_n64">; -def SVQDECP_N_U32 : SInst<"svqdecp[_n_u32]_{d}", "mmP", "PcPsPiPl", MergeNone, "aarch64_sve_uqdecp_n32">; -def SVQDECP_N_U64 : SInst<"svqdecp[_n_u64]_{d}", "nnP", "PcPsPiPl", MergeNone, "aarch64_sve_uqdecp_n64">; -def SVQINCP_N_S32 : SInst<"svqincp[_n_s32]_{d}", "kkP", "PcPsPiPl", MergeNone, "aarch64_sve_sqincp_n32">; -def SVQINCP_N_S64 : SInst<"svqincp[_n_s64]_{d}", "llP", "PcPsPiPl", MergeNone, "aarch64_sve_sqincp_n64">; -def SVQINCP_N_U32 : 
SInst<"svqincp[_n_u32]_{d}", "mmP", "PcPsPiPl", MergeNone, "aarch64_sve_uqincp_n32">; -def SVQINCP_N_U64 : SInst<"svqincp[_n_u64]_{d}", "nnP", "PcPsPiPl", MergeNone, "aarch64_sve_uqincp_n64">; +def SVQDECP_N_S32 : SInst<"svqdecp[_n_s32]_{d}", "kkP", "PcPsPiPl", MergeNone, "aarch64_sve_sqdecp_n32", [IsStreamingCompatible]>; +def SVQDECP_N_S64 : SInst<"svqdecp[_n_s64]_{d}", "llP", "PcPsPiPl", MergeNone, "aarch64_sve_sqdecp_n64", [IsStreamingCompatible]>; +def SVQDECP_N_U32 : SInst<"svqdecp[_n_u32]_{d}", "mmP", "PcPsPiPl", MergeNone, "aarch64_sve_uqdecp_n32", [IsStreamingCompatible]>; +def SVQDECP_N_U64 : SInst<"svqdecp[_n_u64]_{d}", "nnP", "PcPsPiPl", MergeNone, "aarch64_sve_uqdecp_n64", [IsStreamingCompatible]>; +def SVQINCP_N_S32 : SInst<"svqincp[_n_s32]_{d}", "kkP", "PcPsPiPl", MergeNone, "aarch64_sve_sqincp_n32", [IsStreamingCompatible]>; +def SVQINCP_N_S64 : SInst<"svqincp[_n_s64]_{d}", "llP", "PcPsPiPl", MergeNone, "aarch64_sve_sqincp_n64", [IsStreamingCompatible]>; +def SVQINCP_N_U32 : SInst<"svqincp[_n_u32]_{d}", "mmP", "PcPsPiPl", MergeNone, "aarch64_sve_uqincp_n32", [IsStreamingCompatible]>; +def SVQINCP_N_U64 : SInst<"svqincp[_n_u64]_{d}", "nnP", "PcPsPiPl", MergeNone, "aarch64_sve_uqincp_n64", [IsStreamingCompatible]>; let ArchGuard = "defined(__ARM_FEATURE_SVE_MATMUL_INT8)" in { def SVMLLA_S32 : SInst<"svmmla[_s32]", "ddqq","i", MergeNone, "aarch64_sve_smmla">; def SVMLLA_U32 : SInst<"svmmla[_u32]", "ddqq","Ui", MergeNone, "aarch64_sve_ummla">; def SVUSMLLA_S32 : SInst<"svusmmla[_s32]", "ddbq","i", MergeNone, "aarch64_sve_usmmla">; -def SVUSDOT_S : SInst<"svusdot[_s32]", "ddbq", "i", MergeNone, "aarch64_sve_usdot">; -def SVUSDOT_N_S : SInst<"svusdot[_n_s32]", "ddbr", "i", MergeNone, "aarch64_sve_usdot">; -def SVSUDOT_S : SInst<"svsudot[_s32]", "ddqb", "i", MergeNone, "aarch64_sve_usdot", [ReverseUSDOT]>; -def SVSUDOT_N_S : SInst<"svsudot[_n_s32]", "ddq@", "i", MergeNone, "aarch64_sve_usdot", [ReverseUSDOT]>; +def SVUSDOT_S : SInst<"svusdot[_s32]", "ddbq", "i", MergeNone, "aarch64_sve_usdot", [IsStreamingCompatible]>; +def SVUSDOT_N_S : SInst<"svusdot[_n_s32]", "ddbr", "i", MergeNone, "aarch64_sve_usdot", [IsStreamingCompatible]>; +def SVSUDOT_S : SInst<"svsudot[_s32]", "ddqb", "i", MergeNone, "aarch64_sve_usdot", [ReverseUSDOT, IsStreamingCompatible]>; +def SVSUDOT_N_S : SInst<"svsudot[_n_s32]", "ddq@", "i", MergeNone, "aarch64_sve_usdot", [ReverseUSDOT, IsStreamingCompatible]>; -def SVUSDOT_LANE_S : SInst<"svusdot_lane[_s32]", "ddbqi", "i", MergeNone, "aarch64_sve_usdot_lane", [], [ImmCheck<3, ImmCheckLaneIndexDot, 2>]>; -def SVSUDOT_LANE_S : SInst<"svsudot_lane[_s32]", "ddqbi", "i", MergeNone, "aarch64_sve_sudot_lane", [], [ImmCheck<3, ImmCheckLaneIndexDot, 2>]>; +def SVUSDOT_LANE_S : SInst<"svusdot_lane[_s32]", "ddbqi", "i", MergeNone, "aarch64_sve_usdot_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndexDot, 2>]>; +def SVSUDOT_LANE_S : SInst<"svsudot_lane[_s32]", "ddqbi", "i", MergeNone, "aarch64_sve_sudot_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndexDot, 2>]>; } let ArchGuard = "defined(__ARM_FEATURE_SVE_MATMUL_FP32)" in { @@ -1416,12 +1198,12 @@ let ArchGuard = "defined(__ARM_FEATURE_SVE_MATMUL_FP64)" in { def SVMLLA_F64 : SInst<"svmmla[_f64]", "dddd","d", MergeNone, "aarch64_sve_fmmla">; -def SVTRN1Q : SInst<"svtrn1q[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_trn1q">; -def SVTRN2Q : SInst<"svtrn2q[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_trn2q">; -def SVUZP1Q : SInst<"svuzp1q[_{d}]", "ddd", "csilUcUsUiUlhfd", 
MergeNone, "aarch64_sve_uzp1q">; -def SVUZP2Q : SInst<"svuzp2q[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_uzp2q">; -def SVZIP1Q : SInst<"svzip1q[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_zip1q">; -def SVZIP2Q : SInst<"svzip2q[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_zip2q">; +def SVTRN1Q : SInst<"svtrn1q[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_trn1q", [IsStreamingCompatible]>; +def SVTRN2Q : SInst<"svtrn2q[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_trn2q", [IsStreamingCompatible]>; +def SVUZP1Q : SInst<"svuzp1q[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_uzp1q", [IsStreamingCompatible]>; +def SVUZP2Q : SInst<"svuzp2q[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_uzp2q", [IsStreamingCompatible]>; +def SVZIP1Q : SInst<"svzip1q[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_zip1q", [IsStreamingCompatible]>; +def SVZIP2Q : SInst<"svzip2q[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_zip2q", [IsStreamingCompatible]>; } let ArchGuard = "defined(__ARM_FEATURE_SVE_MATMUL_FP64) && defined(__ARM_FEATURE_SVE_BF16)" in { @@ -1478,14 +1260,14 @@ //////////////////////////////////////////////////////////////////////////////// // SVE2 WhileGE/GT let ArchGuard = "defined(__ARM_FEATURE_SVE2)" in { -def SVWHILEGE_S32 : SInst<"svwhilege_{d}[_{1}]", "Pkk", "PcPsPiPl", MergeNone, "aarch64_sve_whilege", [IsOverloadWhile]>; -def SVWHILEGE_S64 : SInst<"svwhilege_{d}[_{1}]", "Pll", "PcPsPiPl", MergeNone, "aarch64_sve_whilege", [IsOverloadWhile]>; -def SVWHILEGT_S32 : SInst<"svwhilegt_{d}[_{1}]", "Pkk", "PcPsPiPl", MergeNone, "aarch64_sve_whilegt", [IsOverloadWhile]>; -def SVWHILEGT_S64 : SInst<"svwhilegt_{d}[_{1}]", "Pll", "PcPsPiPl", MergeNone, "aarch64_sve_whilegt", [IsOverloadWhile]>; -def SVWHILEHI_U32 : SInst<"svwhilegt_{d}[_{1}]", "Pmm", "PUcPUsPUiPUl", MergeNone, "aarch64_sve_whilehi", [IsOverloadWhile]>; -def SVWHILEHI_U64 : SInst<"svwhilegt_{d}[_{1}]", "Pnn", "PUcPUsPUiPUl", MergeNone, "aarch64_sve_whilehi", [IsOverloadWhile]>; -def SVWHILEHS_U32 : SInst<"svwhilege_{d}[_{1}]", "Pmm", "PUcPUsPUiPUl", MergeNone, "aarch64_sve_whilehs", [IsOverloadWhile]>; -def SVWHILEHS_U64 : SInst<"svwhilege_{d}[_{1}]", "Pnn", "PUcPUsPUiPUl", MergeNone, "aarch64_sve_whilehs", [IsOverloadWhile]>; +def SVWHILEGE_S32 : SInst<"svwhilege_{d}[_{1}]", "Pkk", "PcPsPiPl", MergeNone, "aarch64_sve_whilege", [IsOverloadWhile, IsStreamingCompatible]>; +def SVWHILEGE_S64 : SInst<"svwhilege_{d}[_{1}]", "Pll", "PcPsPiPl", MergeNone, "aarch64_sve_whilege", [IsOverloadWhile, IsStreamingCompatible]>; +def SVWHILEGT_S32 : SInst<"svwhilegt_{d}[_{1}]", "Pkk", "PcPsPiPl", MergeNone, "aarch64_sve_whilegt", [IsOverloadWhile, IsStreamingCompatible]>; +def SVWHILEGT_S64 : SInst<"svwhilegt_{d}[_{1}]", "Pll", "PcPsPiPl", MergeNone, "aarch64_sve_whilegt", [IsOverloadWhile, IsStreamingCompatible]>; +def SVWHILEHI_U32 : SInst<"svwhilegt_{d}[_{1}]", "Pmm", "PUcPUsPUiPUl", MergeNone, "aarch64_sve_whilehi", [IsOverloadWhile, IsStreamingCompatible]>; +def SVWHILEHI_U64 : SInst<"svwhilegt_{d}[_{1}]", "Pnn", "PUcPUsPUiPUl", MergeNone, "aarch64_sve_whilehi", [IsOverloadWhile, IsStreamingCompatible]>; +def SVWHILEHS_U32 : SInst<"svwhilege_{d}[_{1}]", "Pmm", "PUcPUsPUiPUl", MergeNone, "aarch64_sve_whilehs", [IsOverloadWhile, IsStreamingCompatible]>; +def SVWHILEHS_U64 : SInst<"svwhilege_{d}[_{1}]", "Pnn", "PUcPUsPUiPUl", MergeNone, "aarch64_sve_whilehs", [IsOverloadWhile, IsStreamingCompatible]>; } 
//////////////////////////////////////////////////////////////////////////////// @@ -1527,49 +1309,49 @@ } let ArchGuard = "defined(__ARM_FEATURE_SVE2)" in { -defm SVQRSHL_S : SInstZPZxZ<"svqrshl", "csil", "dPdx", "dPdK", "aarch64_sve_sqrshl">; -defm SVQRSHL_U : SInstZPZxZ<"svqrshl", "UcUsUiUl", "dPdx", "dPdK", "aarch64_sve_uqrshl">; -defm SVQSHL_S : SInstZPZxZ<"svqshl", "csil", "dPdx", "dPdK", "aarch64_sve_sqshl">; -defm SVQSHL_U : SInstZPZxZ<"svqshl", "UcUsUiUl", "dPdx", "dPdK", "aarch64_sve_uqshl">; -defm SVRSHL_S : SInstZPZxZ<"svrshl", "csil", "dPdx", "dPdK", "aarch64_sve_srshl">; -defm SVRSHL_U : SInstZPZxZ<"svrshl", "UcUsUiUl", "dPdx", "dPdK", "aarch64_sve_urshl">; -defm SVSQADD : SInstZPZxZ<"svsqadd", "UcUsUiUl", "dPdx", "dPdK", "aarch64_sve_usqadd">; -defm SVUQADD : SInstZPZxZ<"svuqadd", "csil", "dPdu", "dPdL", "aarch64_sve_suqadd">; - -def SVABA_S : SInst<"svaba[_{d}]", "dddd", "csil" , MergeNone, "aarch64_sve_saba">; -def SVABA_U : SInst<"svaba[_{d}]", "dddd", "UcUsUiUl", MergeNone, "aarch64_sve_uaba">; -def SVQDMULH : SInst<"svqdmulh[_{d}]", "ddd", "csil", MergeNone, "aarch64_sve_sqdmulh">; -def SVQRDMULH : SInst<"svqrdmulh[_{d}]", "ddd", "csil", MergeNone, "aarch64_sve_sqrdmulh">; -def SVQRDMLAH : SInst<"svqrdmlah[_{d}]", "dddd", "csil", MergeNone, "aarch64_sve_sqrdmlah">; -def SVQRDMLSH : SInst<"svqrdmlsh[_{d}]", "dddd", "csil", MergeNone, "aarch64_sve_sqrdmlsh">; - -def SVABA_S_N : SInst<"svaba[_n_{d}]", "ddda", "csil", MergeNone, "aarch64_sve_saba">; -def SVABA_U_N : SInst<"svaba[_n_{d}]", "ddda", "UcUsUiUl", MergeNone, "aarch64_sve_uaba">; -def SVQDMULH_N : SInst<"svqdmulh[_n_{d}]", "dda", "csil", MergeNone, "aarch64_sve_sqdmulh">; -def SVQRDMULH_N : SInst<"svqrdmulh[_n_{d}]", "dda", "csil", MergeNone, "aarch64_sve_sqrdmulh">; -def SVQRDMLAH_N : SInst<"svqrdmlah[_n_{d}]", "ddda", "csil", MergeNone, "aarch64_sve_sqrdmlah">; -def SVQRDMLSH_N : SInst<"svqrdmlsh[_n_{d}]", "ddda", "csil", MergeNone, "aarch64_sve_sqrdmlsh">; - -def SVQDMULH_LANE : SInst<"svqdmulh_lane[_{d}]", "dddi", "sil", MergeNone, "aarch64_sve_sqdmulh_lane", [], [ImmCheck<2, ImmCheckLaneIndex, 1>]>; -def SVQRDMULH_LANE : SInst<"svqrdmulh_lane[_{d}]", "dddi", "sil", MergeNone, "aarch64_sve_sqrdmulh_lane", [], [ImmCheck<2, ImmCheckLaneIndex, 1>]>; -def SVQRDMLAH_LANE : SInst<"svqrdmlah_lane[_{d}]", "ddddi", "sil", MergeNone, "aarch64_sve_sqrdmlah_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; -def SVQRDMLSH_LANE : SInst<"svqrdmlsh_lane[_{d}]", "ddddi", "sil", MergeNone, "aarch64_sve_sqrdmlsh_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; - -def SVQSHLU_M : SInst<"svqshlu[_n_{d}]", "uPdi", "csil", MergeOp1, "aarch64_sve_sqshlu", [], [ImmCheck<2, ImmCheckShiftLeft, 1>]>; -def SVQSHLU_X : SInst<"svqshlu[_n_{d}]", "uPdi", "csil", MergeAny, "aarch64_sve_sqshlu", [], [ImmCheck<2, ImmCheckShiftLeft, 1>]>; -def SVQSHLU_Z : SInst<"svqshlu[_n_{d}]", "uPdi", "csil", MergeZero, "aarch64_sve_sqshlu", [], [ImmCheck<2, ImmCheckShiftLeft, 1>]>; -def SVRSHR_M_S : SInst<"svrshr[_n_{d}]", "dPdi", "csil", MergeOp1, "aarch64_sve_srshr", [], [ImmCheck<2, ImmCheckShiftRight, 1>]>; -def SVRSHR_M_U : SInst<"svrshr[_n_{d}]", "dPdi", "UcUsUiUl", MergeOp1, "aarch64_sve_urshr", [], [ImmCheck<2, ImmCheckShiftRight, 1>]>; -def SVRSHR_X_S : SInst<"svrshr[_n_{d}]", "dPdi", "csil", MergeAny, "aarch64_sve_srshr", [], [ImmCheck<2, ImmCheckShiftRight, 1>]>; -def SVRSHR_X_U : SInst<"svrshr[_n_{d}]", "dPdi", "UcUsUiUl", MergeAny, "aarch64_sve_urshr", [], [ImmCheck<2, ImmCheckShiftRight, 1>]>; -def SVRSHR_Z_S : SInst<"svrshr[_n_{d}]", "dPdi", 
"csil", MergeZero, "aarch64_sve_srshr", [], [ImmCheck<2, ImmCheckShiftRight, 1>]>; -def SVRSHR_Z_U : SInst<"svrshr[_n_{d}]", "dPdi", "UcUsUiUl", MergeZero, "aarch64_sve_urshr", [], [ImmCheck<2, ImmCheckShiftRight, 1>]>; -def SVRSRA_S : SInst<"svrsra[_n_{d}]", "dddi", "csil", MergeNone, "aarch64_sve_srsra", [], [ImmCheck<2, ImmCheckShiftRight, 1>]>; -def SVRSRA_U : SInst<"svrsra[_n_{d}]", "dddi", "UcUsUiUl", MergeNone, "aarch64_sve_ursra", [], [ImmCheck<2, ImmCheckShiftRight, 1>]>; -def SVSLI : SInst<"svsli[_n_{d}]", "dddi", "csilUcUsUiUl", MergeNone, "aarch64_sve_sli", [], [ImmCheck<2, ImmCheckShiftLeft, 1>]>; -def SVSRA_S : SInst<"svsra[_n_{d}]", "dddi", "csil", MergeNone, "aarch64_sve_ssra", [], [ImmCheck<2, ImmCheckShiftRight, 1>]>; -def SVSRA_U : SInst<"svsra[_n_{d}]", "dddi", "UcUsUiUl", MergeNone, "aarch64_sve_usra", [], [ImmCheck<2, ImmCheckShiftRight, 1>]>; -def SVSRI : SInst<"svsri[_n_{d}]", "dddi", "csilUcUsUiUl", MergeNone, "aarch64_sve_sri", [], [ImmCheck<2, ImmCheckShiftRight, 1>]>; +defm SVQRSHL_S : SInstZPZxZ<"svqrshl", "csil", "dPdx", "dPdK", "aarch64_sve_sqrshl", [IsStreamingCompatible]>; +defm SVQRSHL_U : SInstZPZxZ<"svqrshl", "UcUsUiUl", "dPdx", "dPdK", "aarch64_sve_uqrshl", [IsStreamingCompatible]>; +defm SVQSHL_S : SInstZPZxZ<"svqshl", "csil", "dPdx", "dPdK", "aarch64_sve_sqshl", [IsStreamingCompatible]>; +defm SVQSHL_U : SInstZPZxZ<"svqshl", "UcUsUiUl", "dPdx", "dPdK", "aarch64_sve_uqshl", [IsStreamingCompatible]>; +defm SVRSHL_S : SInstZPZxZ<"svrshl", "csil", "dPdx", "dPdK", "aarch64_sve_srshl", [IsStreamingCompatible]>; +defm SVRSHL_U : SInstZPZxZ<"svrshl", "UcUsUiUl", "dPdx", "dPdK", "aarch64_sve_urshl", [IsStreamingCompatible]>; +defm SVSQADD : SInstZPZxZ<"svsqadd", "UcUsUiUl", "dPdx", "dPdK", "aarch64_sve_usqadd", [IsStreamingCompatible]>; +defm SVUQADD : SInstZPZxZ<"svuqadd", "csil", "dPdu", "dPdL", "aarch64_sve_suqadd", [IsStreamingCompatible]>; + +def SVABA_S : SInst<"svaba[_{d}]", "dddd", "csil" , MergeNone, "aarch64_sve_saba", [IsStreamingCompatible]>; +def SVABA_U : SInst<"svaba[_{d}]", "dddd", "UcUsUiUl", MergeNone, "aarch64_sve_uaba", [IsStreamingCompatible]>; +def SVQDMULH : SInst<"svqdmulh[_{d}]", "ddd", "csil", MergeNone, "aarch64_sve_sqdmulh", [IsStreamingCompatible]>; +def SVQRDMULH : SInst<"svqrdmulh[_{d}]", "ddd", "csil", MergeNone, "aarch64_sve_sqrdmulh", [IsStreamingCompatible]>; +def SVQRDMLAH : SInst<"svqrdmlah[_{d}]", "dddd", "csil", MergeNone, "aarch64_sve_sqrdmlah", [IsStreamingCompatible]>; +def SVQRDMLSH : SInst<"svqrdmlsh[_{d}]", "dddd", "csil", MergeNone, "aarch64_sve_sqrdmlsh", [IsStreamingCompatible]>; + +def SVABA_S_N : SInst<"svaba[_n_{d}]", "ddda", "csil", MergeNone, "aarch64_sve_saba", [IsStreamingCompatible]>; +def SVABA_U_N : SInst<"svaba[_n_{d}]", "ddda", "UcUsUiUl", MergeNone, "aarch64_sve_uaba", [IsStreamingCompatible]>; +def SVQDMULH_N : SInst<"svqdmulh[_n_{d}]", "dda", "csil", MergeNone, "aarch64_sve_sqdmulh", [IsStreamingCompatible]>; +def SVQRDMULH_N : SInst<"svqrdmulh[_n_{d}]", "dda", "csil", MergeNone, "aarch64_sve_sqrdmulh", [IsStreamingCompatible]>; +def SVQRDMLAH_N : SInst<"svqrdmlah[_n_{d}]", "ddda", "csil", MergeNone, "aarch64_sve_sqrdmlah", [IsStreamingCompatible]>; +def SVQRDMLSH_N : SInst<"svqrdmlsh[_n_{d}]", "ddda", "csil", MergeNone, "aarch64_sve_sqrdmlsh", [IsStreamingCompatible]>; + +def SVQDMULH_LANE : SInst<"svqdmulh_lane[_{d}]", "dddi", "sil", MergeNone, "aarch64_sve_sqdmulh_lane", [IsStreamingCompatible], [ImmCheck<2, ImmCheckLaneIndex, 1>]>; +def SVQRDMULH_LANE : SInst<"svqrdmulh_lane[_{d}]", "dddi", 
"sil", MergeNone, "aarch64_sve_sqrdmulh_lane", [IsStreamingCompatible], [ImmCheck<2, ImmCheckLaneIndex, 1>]>; +def SVQRDMLAH_LANE : SInst<"svqrdmlah_lane[_{d}]", "ddddi", "sil", MergeNone, "aarch64_sve_sqrdmlah_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; +def SVQRDMLSH_LANE : SInst<"svqrdmlsh_lane[_{d}]", "ddddi", "sil", MergeNone, "aarch64_sve_sqrdmlsh_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; + +def SVQSHLU_M : SInst<"svqshlu[_n_{d}]", "uPdi", "csil", MergeOp1, "aarch64_sve_sqshlu", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftLeft, 1>]>; +def SVQSHLU_X : SInst<"svqshlu[_n_{d}]", "uPdi", "csil", MergeAny, "aarch64_sve_sqshlu", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftLeft, 1>]>; +def SVQSHLU_Z : SInst<"svqshlu[_n_{d}]", "uPdi", "csil", MergeZero, "aarch64_sve_sqshlu", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftLeft, 1>]>; +def SVRSHR_M_S : SInst<"svrshr[_n_{d}]", "dPdi", "csil", MergeOp1, "aarch64_sve_srshr", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRight, 1>]>; +def SVRSHR_M_U : SInst<"svrshr[_n_{d}]", "dPdi", "UcUsUiUl", MergeOp1, "aarch64_sve_urshr", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRight, 1>]>; +def SVRSHR_X_S : SInst<"svrshr[_n_{d}]", "dPdi", "csil", MergeAny, "aarch64_sve_srshr", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRight, 1>]>; +def SVRSHR_X_U : SInst<"svrshr[_n_{d}]", "dPdi", "UcUsUiUl", MergeAny, "aarch64_sve_urshr", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRight, 1>]>; +def SVRSHR_Z_S : SInst<"svrshr[_n_{d}]", "dPdi", "csil", MergeZero, "aarch64_sve_srshr", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRight, 1>]>; +def SVRSHR_Z_U : SInst<"svrshr[_n_{d}]", "dPdi", "UcUsUiUl", MergeZero, "aarch64_sve_urshr", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRight, 1>]>; +def SVRSRA_S : SInst<"svrsra[_n_{d}]", "dddi", "csil", MergeNone, "aarch64_sve_srsra", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRight, 1>]>; +def SVRSRA_U : SInst<"svrsra[_n_{d}]", "dddi", "UcUsUiUl", MergeNone, "aarch64_sve_ursra", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRight, 1>]>; +def SVSLI : SInst<"svsli[_n_{d}]", "dddi", "csilUcUsUiUl", MergeNone, "aarch64_sve_sli", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftLeft, 1>]>; +def SVSRA_S : SInst<"svsra[_n_{d}]", "dddi", "csil", MergeNone, "aarch64_sve_ssra", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRight, 1>]>; +def SVSRA_U : SInst<"svsra[_n_{d}]", "dddi", "UcUsUiUl", MergeNone, "aarch64_sve_usra", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRight, 1>]>; +def SVSRI : SInst<"svsri[_n_{d}]", "dddi", "csilUcUsUiUl", MergeNone, "aarch64_sve_sri", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRight, 1>]>; } //////////////////////////////////////////////////////////////////////////////// @@ -1581,29 +1363,29 @@ } let ArchGuard = "defined(__ARM_FEATURE_SVE2)" in { -defm SVADDP : SInstPairwise<"svaddp", "csliUcUsUiUl", "aarch64_sve_addp">; -defm SVADDP_F : SInstPairwise<"svaddp", "hfd", "aarch64_sve_faddp">; -defm SVMAXNMP : SInstPairwise<"svmaxnmp", "hfd", "aarch64_sve_fmaxnmp">; -defm SVMAXP_F : SInstPairwise<"svmaxp", "hfd", "aarch64_sve_fmaxp">; -defm SVMAXP_S : SInstPairwise<"svmaxp", "csli", "aarch64_sve_smaxp">; -defm SVMAXP_U : SInstPairwise<"svmaxp", "UcUsUiUl", "aarch64_sve_umaxp">; -defm SVMINNMP : SInstPairwise<"svminnmp", "hfd", "aarch64_sve_fminnmp">; -defm SVMINP_F : SInstPairwise<"svminp", "hfd", "aarch64_sve_fminp">; -defm SVMINP_S : SInstPairwise<"svminp", "csli", 
"aarch64_sve_sminp">; -defm SVMINP_U : SInstPairwise<"svminp", "UcUsUiUl", "aarch64_sve_uminp">; +defm SVADDP : SInstPairwise<"svaddp", "csliUcUsUiUl", "aarch64_sve_addp", [IsStreamingCompatible]>; +defm SVADDP_F : SInstPairwise<"svaddp", "hfd", "aarch64_sve_faddp", [IsStreamingCompatible]>; +defm SVMAXNMP : SInstPairwise<"svmaxnmp", "hfd", "aarch64_sve_fmaxnmp", [IsStreamingCompatible]>; +defm SVMAXP_F : SInstPairwise<"svmaxp", "hfd", "aarch64_sve_fmaxp", [IsStreamingCompatible]>; +defm SVMAXP_S : SInstPairwise<"svmaxp", "csli", "aarch64_sve_smaxp", [IsStreamingCompatible]>; +defm SVMAXP_U : SInstPairwise<"svmaxp", "UcUsUiUl", "aarch64_sve_umaxp", [IsStreamingCompatible]>; +defm SVMINNMP : SInstPairwise<"svminnmp", "hfd", "aarch64_sve_fminnmp", [IsStreamingCompatible]>; +defm SVMINP_F : SInstPairwise<"svminp", "hfd", "aarch64_sve_fminp", [IsStreamingCompatible]>; +defm SVMINP_S : SInstPairwise<"svminp", "csli", "aarch64_sve_sminp", [IsStreamingCompatible]>; +defm SVMINP_U : SInstPairwise<"svminp", "UcUsUiUl", "aarch64_sve_uminp", [IsStreamingCompatible]>; } //////////////////////////////////////////////////////////////////////////////// // SVE2 - Widening pairwise arithmetic let ArchGuard = "defined(__ARM_FEATURE_SVE2)" in { -def SVADALP_S_M : SInst<"svadalp[_{d}]", "dPdh", "sil", MergeOp1, "aarch64_sve_sadalp">; -def SVADALP_S_X : SInst<"svadalp[_{d}]", "dPdh", "sil", MergeAny, "aarch64_sve_sadalp">; -def SVADALP_S_Z : SInst<"svadalp[_{d}]", "dPdh", "sil", MergeZero, "aarch64_sve_sadalp">; +def SVADALP_S_M : SInst<"svadalp[_{d}]", "dPdh", "sil", MergeOp1, "aarch64_sve_sadalp", [IsStreamingCompatible]>; +def SVADALP_S_X : SInst<"svadalp[_{d}]", "dPdh", "sil", MergeAny, "aarch64_sve_sadalp", [IsStreamingCompatible]>; +def SVADALP_S_Z : SInst<"svadalp[_{d}]", "dPdh", "sil", MergeZero, "aarch64_sve_sadalp", [IsStreamingCompatible]>; -def SVADALP_U_M : SInst<"svadalp[_{d}]", "dPdh", "UsUiUl", MergeOp1, "aarch64_sve_uadalp">; -def SVADALP_U_X : SInst<"svadalp[_{d}]", "dPdh", "UsUiUl", MergeAny, "aarch64_sve_uadalp">; -def SVADALP_U_Z : SInst<"svadalp[_{d}]", "dPdh", "UsUiUl", MergeZero, "aarch64_sve_uadalp">; +def SVADALP_U_M : SInst<"svadalp[_{d}]", "dPdh", "UsUiUl", MergeOp1, "aarch64_sve_uadalp", [IsStreamingCompatible]>; +def SVADALP_U_X : SInst<"svadalp[_{d}]", "dPdh", "UsUiUl", MergeAny, "aarch64_sve_uadalp", [IsStreamingCompatible]>; +def SVADALP_U_Z : SInst<"svadalp[_{d}]", "dPdh", "UsUiUl", MergeZero, "aarch64_sve_uadalp", [IsStreamingCompatible]>; } //////////////////////////////////////////////////////////////////////////////// @@ -1611,56 +1393,56 @@ // let ArchGuard = "defined(__ARM_FEATURE_SVE2)" in { -def SVBCAX : SInst<"svbcax[_{d}]", "dddd", "csilUcUsUiUl", MergeNone, "aarch64_sve_bcax">; -def SVBSL : SInst<"svbsl[_{d}]", "dddd", "csilUcUsUiUl", MergeNone, "aarch64_sve_bsl">; -def SVBSL1N : SInst<"svbsl1n[_{d}]", "dddd", "csilUcUsUiUl", MergeNone, "aarch64_sve_bsl1n">; -def SVBSL2N : SInst<"svbsl2n[_{d}]", "dddd", "csilUcUsUiUl", MergeNone, "aarch64_sve_bsl2n">; -def SVEOR3 : SInst<"sveor3[_{d}]", "dddd", "csilUcUsUiUl", MergeNone, "aarch64_sve_eor3">; -def SVNBSL : SInst<"svnbsl[_{d}]", "dddd", "csilUcUsUiUl", MergeNone, "aarch64_sve_nbsl">; - -def SVBCAX_N : SInst<"svbcax[_n_{d}]", "ddda", "csilUcUsUiUl", MergeNone, "aarch64_sve_bcax">; -def SVBSL_N : SInst<"svbsl[_n_{d}]", "ddda", "csilUcUsUiUl", MergeNone, "aarch64_sve_bsl">; -def SVBSL1N_N : SInst<"svbsl1n[_n_{d}]", "ddda", "csilUcUsUiUl", MergeNone, "aarch64_sve_bsl1n">; -def SVBSL2N_N : SInst<"svbsl2n[_n_{d}]", "ddda", 
"csilUcUsUiUl", MergeNone, "aarch64_sve_bsl2n">; -def SVEOR3_N : SInst<"sveor3[_n_{d}]", "ddda", "csilUcUsUiUl", MergeNone, "aarch64_sve_eor3">; -def SVNBSL_N : SInst<"svnbsl[_n_{d}]", "ddda", "csilUcUsUiUl", MergeNone, "aarch64_sve_nbsl">; -def SVXAR_N : SInst<"svxar[_n_{d}]", "dddi", "csilUcUsUiUl", MergeNone, "aarch64_sve_xar", [], [ImmCheck<2, ImmCheckShiftRight, 1>]>; +def SVBCAX : SInst<"svbcax[_{d}]", "dddd", "csilUcUsUiUl", MergeNone, "aarch64_sve_bcax", [IsStreamingCompatible]>; +def SVBSL : SInst<"svbsl[_{d}]", "dddd", "csilUcUsUiUl", MergeNone, "aarch64_sve_bsl", [IsStreamingCompatible]>; +def SVBSL1N : SInst<"svbsl1n[_{d}]", "dddd", "csilUcUsUiUl", MergeNone, "aarch64_sve_bsl1n", [IsStreamingCompatible]>; +def SVBSL2N : SInst<"svbsl2n[_{d}]", "dddd", "csilUcUsUiUl", MergeNone, "aarch64_sve_bsl2n", [IsStreamingCompatible]>; +def SVEOR3 : SInst<"sveor3[_{d}]", "dddd", "csilUcUsUiUl", MergeNone, "aarch64_sve_eor3", [IsStreamingCompatible]>; +def SVNBSL : SInst<"svnbsl[_{d}]", "dddd", "csilUcUsUiUl", MergeNone, "aarch64_sve_nbsl", [IsStreamingCompatible]>; + +def SVBCAX_N : SInst<"svbcax[_n_{d}]", "ddda", "csilUcUsUiUl", MergeNone, "aarch64_sve_bcax", [IsStreamingCompatible]>; +def SVBSL_N : SInst<"svbsl[_n_{d}]", "ddda", "csilUcUsUiUl", MergeNone, "aarch64_sve_bsl", [IsStreamingCompatible]>; +def SVBSL1N_N : SInst<"svbsl1n[_n_{d}]", "ddda", "csilUcUsUiUl", MergeNone, "aarch64_sve_bsl1n", [IsStreamingCompatible]>; +def SVBSL2N_N : SInst<"svbsl2n[_n_{d}]", "ddda", "csilUcUsUiUl", MergeNone, "aarch64_sve_bsl2n", [IsStreamingCompatible]>; +def SVEOR3_N : SInst<"sveor3[_n_{d}]", "ddda", "csilUcUsUiUl", MergeNone, "aarch64_sve_eor3", [IsStreamingCompatible]>; +def SVNBSL_N : SInst<"svnbsl[_n_{d}]", "ddda", "csilUcUsUiUl", MergeNone, "aarch64_sve_nbsl", [IsStreamingCompatible]>; +def SVXAR_N : SInst<"svxar[_n_{d}]", "dddi", "csilUcUsUiUl", MergeNone, "aarch64_sve_xar", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRight, 1>]>; } //////////////////////////////////////////////////////////////////////////////// // SVE2 - Large integer arithmetic let ArchGuard = "defined(__ARM_FEATURE_SVE2)" in { -def SVADCLB : SInst<"svadclb[_{d}]", "dddd", "UiUl", MergeNone, "aarch64_sve_adclb">; -def SVADCLT : SInst<"svadclt[_{d}]", "dddd", "UiUl", MergeNone, "aarch64_sve_adclt">; -def SVSBCLB : SInst<"svsbclb[_{d}]", "dddd", "UiUl", MergeNone, "aarch64_sve_sbclb">; -def SVSBCLT : SInst<"svsbclt[_{d}]", "dddd", "UiUl", MergeNone, "aarch64_sve_sbclt">; +def SVADCLB : SInst<"svadclb[_{d}]", "dddd", "UiUl", MergeNone, "aarch64_sve_adclb", [IsStreamingCompatible]>; +def SVADCLT : SInst<"svadclt[_{d}]", "dddd", "UiUl", MergeNone, "aarch64_sve_adclt", [IsStreamingCompatible]>; +def SVSBCLB : SInst<"svsbclb[_{d}]", "dddd", "UiUl", MergeNone, "aarch64_sve_sbclb", [IsStreamingCompatible]>; +def SVSBCLT : SInst<"svsbclt[_{d}]", "dddd", "UiUl", MergeNone, "aarch64_sve_sbclt", [IsStreamingCompatible]>; -def SVADCLB_N : SInst<"svadclb[_n_{d}]", "ddda", "UiUl", MergeNone, "aarch64_sve_adclb">; -def SVADCLT_N : SInst<"svadclt[_n_{d}]", "ddda", "UiUl", MergeNone, "aarch64_sve_adclt">; -def SVSBCLB_N : SInst<"svsbclb[_n_{d}]", "ddda", "UiUl", MergeNone, "aarch64_sve_sbclb">; -def SVSBCLT_N : SInst<"svsbclt[_n_{d}]", "ddda", "UiUl", MergeNone, "aarch64_sve_sbclt">; +def SVADCLB_N : SInst<"svadclb[_n_{d}]", "ddda", "UiUl", MergeNone, "aarch64_sve_adclb", [IsStreamingCompatible]>; +def SVADCLT_N : SInst<"svadclt[_n_{d}]", "ddda", "UiUl", MergeNone, "aarch64_sve_adclt", [IsStreamingCompatible]>; +def SVSBCLB_N : 
SInst<"svsbclb[_n_{d}]", "ddda", "UiUl", MergeNone, "aarch64_sve_sbclb", [IsStreamingCompatible]>; +def SVSBCLT_N : SInst<"svsbclt[_n_{d}]", "ddda", "UiUl", MergeNone, "aarch64_sve_sbclt", [IsStreamingCompatible]>; } //////////////////////////////////////////////////////////////////////////////// // SVE2 - Multiplication by indexed elements let ArchGuard = "defined(__ARM_FEATURE_SVE2)" in { -def SVMLA_LANE_2 : SInst<"svmla_lane[_{d}]", "ddddi", "silUsUiUl", MergeNone, "aarch64_sve_mla_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; -def SVMLS_LANE_2 : SInst<"svmls_lane[_{d}]", "ddddi", "silUsUiUl", MergeNone, "aarch64_sve_mls_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; -def SVMUL_LANE_2 : SInst<"svmul_lane[_{d}]", "dddi", "silUsUiUl", MergeNone, "aarch64_sve_mul_lane", [], [ImmCheck<2, ImmCheckLaneIndex, 1>]>; +def SVMLA_LANE_2 : SInst<"svmla_lane[_{d}]", "ddddi", "silUsUiUl", MergeNone, "aarch64_sve_mla_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; +def SVMLS_LANE_2 : SInst<"svmls_lane[_{d}]", "ddddi", "silUsUiUl", MergeNone, "aarch64_sve_mls_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; +def SVMUL_LANE_2 : SInst<"svmul_lane[_{d}]", "dddi", "silUsUiUl", MergeNone, "aarch64_sve_mul_lane", [IsStreamingCompatible], [ImmCheck<2, ImmCheckLaneIndex, 1>]>; } //////////////////////////////////////////////////////////////////////////////// // SVE2 - Uniform complex integer arithmetic let ArchGuard = "defined(__ARM_FEATURE_SVE2)" in { -def SVCADD : SInst<"svcadd[_{d}]", "dddi", "csilUcUsUiUl", MergeNone, "aarch64_sve_cadd_x", [], [ImmCheck<2, ImmCheckComplexRot90_270>]>; -def SVSQCADD : SInst<"svqcadd[_{d}]", "dddi", "csil", MergeNone, "aarch64_sve_sqcadd_x", [], [ImmCheck<2, ImmCheckComplexRot90_270>]>; -def SVCMLA : SInst<"svcmla[_{d}]", "ddddi", "csilUcUsUiUl", MergeNone, "aarch64_sve_cmla_x", [], [ImmCheck<3, ImmCheckComplexRotAll90>]>; -def SVCMLA_LANE_X : SInst<"svcmla_lane[_{d}]", "ddddii", "siUsUi", MergeNone, "aarch64_sve_cmla_lane_x", [], [ImmCheck<3, ImmCheckLaneIndexCompRotate, 2>, +def SVCADD : SInst<"svcadd[_{d}]", "dddi", "csilUcUsUiUl", MergeNone, "aarch64_sve_cadd_x", [IsStreamingCompatible], [ImmCheck<2, ImmCheckComplexRot90_270>]>; +def SVSQCADD : SInst<"svqcadd[_{d}]", "dddi", "csil", MergeNone, "aarch64_sve_sqcadd_x", [IsStreamingCompatible], [ImmCheck<2, ImmCheckComplexRot90_270>]>; +def SVCMLA : SInst<"svcmla[_{d}]", "ddddi", "csilUcUsUiUl", MergeNone, "aarch64_sve_cmla_x", [IsStreamingCompatible], [ImmCheck<3, ImmCheckComplexRotAll90>]>; +def SVCMLA_LANE_X : SInst<"svcmla_lane[_{d}]", "ddddii", "siUsUi", MergeNone, "aarch64_sve_cmla_lane_x", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndexCompRotate, 2>, ImmCheck<4, ImmCheckComplexRotAll90>]>; -def SVSQRDCMLAH_X : SInst<"svqrdcmlah[_{d}]", "ddddi", "csil", MergeNone, "aarch64_sve_sqrdcmlah_x", [], [ImmCheck<3, ImmCheckComplexRotAll90>]>; -def SVSQRDCMLAH_LANE_X : SInst<"svqrdcmlah_lane[_{d}]", "ddddii", "si", MergeNone, "aarch64_sve_sqrdcmlah_lane_x", [], [ImmCheck<3, ImmCheckLaneIndexCompRotate, 2>, +def SVSQRDCMLAH_X : SInst<"svqrdcmlah[_{d}]", "ddddi", "csil", MergeNone, "aarch64_sve_sqrdcmlah_x", [IsStreamingCompatible], [ImmCheck<3, ImmCheckComplexRotAll90>]>; +def SVSQRDCMLAH_LANE_X : SInst<"svqrdcmlah_lane[_{d}]", "ddddii", "si", MergeNone, "aarch64_sve_sqrdcmlah_lane_x", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndexCompRotate, 2>, ImmCheck<4, ImmCheckComplexRotAll90>]>; } @@ -1668,18 +1450,18 @@ // SVE2 - Widening DSP operations multiclass 
SInstWideDSPAcc<string name, string types, string intrinsic> {
-  def    : SInst<name # "[_{d}]",   "ddhh", types, MergeNone, intrinsic>;
-  def _N : SInst<name # "[_n_{d}]", "ddhR", types, MergeNone, intrinsic>;
+  def    : SInst<name # "[_{d}]",   "ddhh", types, MergeNone, intrinsic, [IsStreamingCompatible]>;
+  def _N : SInst<name # "[_n_{d}]", "ddhR", types, MergeNone, intrinsic, [IsStreamingCompatible]>;
 }

 multiclass SInstWideDSPLong<string name, string types, string intrinsic> {
-  def    : SInst<name # "[_{d}]",   "dhh", types, MergeNone, intrinsic>;
-  def _N : SInst<name # "[_n_{d}]", "dhR", types, MergeNone, intrinsic>;
+  def    : SInst<name # "[_{d}]",   "dhh", types, MergeNone, intrinsic, [IsStreamingCompatible]>;
+  def _N : SInst<name # "[_n_{d}]", "dhR", types, MergeNone, intrinsic, [IsStreamingCompatible]>;
 }

 multiclass SInstWideDSPWide<string name, string types, string intrinsic> {
-  def    : SInst<name # "[_{d}]",   "ddh", types, MergeNone, intrinsic>;
-  def _N : SInst<name # "[_n_{d}]", "ddR", types, MergeNone, intrinsic>;
+  def    : SInst<name # "[_{d}]",   "ddh", types, MergeNone, intrinsic, [IsStreamingCompatible]>;
+  def _N : SInst<name # "[_n_{d}]", "ddR", types, MergeNone, intrinsic, [IsStreamingCompatible]>;
 }

 let ArchGuard = "defined(__ARM_FEATURE_SVE2)" in {
@@ -1728,87 +1510,87 @@
 defm SVSUBWT_S : SInstWideDSPWide<"svsubwt", "sil", "aarch64_sve_ssubwt">;
 defm SVSUBWT_U : SInstWideDSPWide<"svsubwt", "UsUiUl", "aarch64_sve_usubwt">;
-def SVSHLLB_S_N : SInst<"svshllb[_n_{d}]", "dhi", "sil", MergeNone, "aarch64_sve_sshllb", [], [ImmCheck<1, ImmCheckShiftLeft, 0>]>;
-def SVSHLLB_U_N : SInst<"svshllb[_n_{d}]", "dhi", "UsUiUl", MergeNone, "aarch64_sve_ushllb", [], [ImmCheck<1, ImmCheckShiftLeft, 0>]>;
-def SVSHLLT_S_N : SInst<"svshllt[_n_{d}]", "dhi", "sil", MergeNone, "aarch64_sve_sshllt", [], [ImmCheck<1, ImmCheckShiftLeft, 0>]>;
-def SVSHLLT_U_N : SInst<"svshllt[_n_{d}]", "dhi", "UsUiUl", MergeNone, "aarch64_sve_ushllt", [], [ImmCheck<1, ImmCheckShiftLeft, 0>]>;
-
-def SVMOVLB_S_N : SInst<"svmovlb[_{d}]", "dh", "sil", MergeNone>;
-def SVMOVLB_U_N : SInst<"svmovlb[_{d}]", "dh", "UsUiUl", MergeNone>;
-def SVMOVLT_S_N : SInst<"svmovlt[_{d}]", "dh", "sil", MergeNone>;
-def SVMOVLT_U_N : SInst<"svmovlt[_{d}]", "dh", "UsUiUl", MergeNone>;
-
-def SVMLALB_S_LANE : SInst<"svmlalb_lane[_{d}]", "ddhhi", "il", MergeNone, "aarch64_sve_smlalb_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>;
-def SVMLALB_U_LANE : SInst<"svmlalb_lane[_{d}]", "ddhhi", "UiUl", MergeNone, "aarch64_sve_umlalb_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>;
-def SVMLALT_S_LANE : SInst<"svmlalt_lane[_{d}]", "ddhhi", "il", MergeNone, "aarch64_sve_smlalt_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>;
-def SVMLALT_U_LANE : SInst<"svmlalt_lane[_{d}]", "ddhhi", "UiUl", MergeNone, "aarch64_sve_umlalt_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>;
-def SVMLSLB_S_LANE : SInst<"svmlslb_lane[_{d}]", "ddhhi", "il", MergeNone, "aarch64_sve_smlslb_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>;
-def SVMLSLB_U_LANE : SInst<"svmlslb_lane[_{d}]", "ddhhi", "UiUl", MergeNone, "aarch64_sve_umlslb_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>;
-def SVMLSLT_S_LANE : SInst<"svmlslt_lane[_{d}]", "ddhhi", "il", MergeNone, "aarch64_sve_smlslt_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>;
-def SVMLSLT_U_LANE : SInst<"svmlslt_lane[_{d}]", "ddhhi", "UiUl", MergeNone, "aarch64_sve_umlslt_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>;
-def SVMULLB_S_LANE : SInst<"svmullb_lane[_{d}]", "dhhi", "il", MergeNone, "aarch64_sve_smullb_lane", [], [ImmCheck<2, ImmCheckLaneIndex, 1>]>;
-def SVMULLB_U_LANE : SInst<"svmullb_lane[_{d}]", "dhhi", "UiUl", MergeNone, "aarch64_sve_umullb_lane", [], [ImmCheck<2, ImmCheckLaneIndex, 1>]>;
-def SVMULLT_S_LANE : SInst<"svmullt_lane[_{d}]", "dhhi", "il", MergeNone, "aarch64_sve_smullt_lane", [], [ImmCheck<2, ImmCheckLaneIndex, 1>]>;
-def SVMULLT_U_LANE : SInst<"svmullt_lane[_{d}]", "dhhi", "UiUl", MergeNone, "aarch64_sve_umullt_lane", [], [ImmCheck<2, ImmCheckLaneIndex, 1>]>;
-def SVQDMLALB_LANE : SInst<"svqdmlalb_lane[_{d}]", "ddhhi", "il", MergeNone, "aarch64_sve_sqdmlalb_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>;
-def SVQDMLALT_LANE : SInst<"svqdmlalt_lane[_{d}]", "ddhhi", "il", MergeNone, "aarch64_sve_sqdmlalt_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>;
-def SVQDMLSLB_LANE : SInst<"svqdmlslb_lane[_{d}]", "ddhhi", "il", MergeNone, "aarch64_sve_sqdmlslb_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>;
-def SVQDMLSLT_LANE :
SInst<"svqdmlslt_lane[_{d}]", "ddhhi", "il", MergeNone, "aarch64_sve_sqdmlslt_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; -def SVQDMULLB_LANE : SInst<"svqdmullb_lane[_{d}]", "dhhi", "il", MergeNone, "aarch64_sve_sqdmullb_lane", [], [ImmCheck<2, ImmCheckLaneIndex, 1>]>; -def SVQDMULLT_LANE : SInst<"svqdmullt_lane[_{d}]", "dhhi", "il", MergeNone, "aarch64_sve_sqdmullt_lane", [], [ImmCheck<2, ImmCheckLaneIndex, 1>]>; +def SVSHLLB_S_N : SInst<"svshllb[_n_{d}]", "dhi", "sil", MergeNone, "aarch64_sve_sshllb", [IsStreamingCompatible], [ImmCheck<1, ImmCheckShiftLeft, 0>]>; +def SVSHLLB_U_N : SInst<"svshllb[_n_{d}]", "dhi", "UsUiUl", MergeNone, "aarch64_sve_ushllb", [IsStreamingCompatible], [ImmCheck<1, ImmCheckShiftLeft, 0>]>; +def SVSHLLT_S_N : SInst<"svshllt[_n_{d}]", "dhi", "sil", MergeNone, "aarch64_sve_sshllt", [IsStreamingCompatible], [ImmCheck<1, ImmCheckShiftLeft, 0>]>; +def SVSHLLT_U_N : SInst<"svshllt[_n_{d}]", "dhi", "UsUiUl", MergeNone, "aarch64_sve_ushllt", [IsStreamingCompatible], [ImmCheck<1, ImmCheckShiftLeft, 0>]>; + +def SVMOVLB_S_N : SInst<"svmovlb[_{d}]", "dh", "sil", MergeNone, "", [IsStreamingCompatible]>; +def SVMOVLB_U_N : SInst<"svmovlb[_{d}]", "dh", "UsUiUl", MergeNone, "", [IsStreamingCompatible]>; +def SVMOVLT_S_N : SInst<"svmovlt[_{d}]", "dh", "sil", MergeNone, "", [IsStreamingCompatible]>; +def SVMOVLT_U_N : SInst<"svmovlt[_{d}]", "dh", "UsUiUl", MergeNone, "", [IsStreamingCompatible]>; + +def SVMLALB_S_LANE : SInst<"svmlalb_lane[_{d}]", "ddhhi", "il", MergeNone, "aarch64_sve_smlalb_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; +def SVMLALB_U_LANE : SInst<"svmlalb_lane[_{d}]", "ddhhi", "UiUl", MergeNone, "aarch64_sve_umlalb_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; +def SVMLALT_S_LANE : SInst<"svmlalt_lane[_{d}]", "ddhhi", "il", MergeNone, "aarch64_sve_smlalt_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; +def SVMLALT_U_LANE : SInst<"svmlalt_lane[_{d}]", "ddhhi", "UiUl", MergeNone, "aarch64_sve_umlalt_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; +def SVMLSLB_S_LANE : SInst<"svmlslb_lane[_{d}]", "ddhhi", "il", MergeNone, "aarch64_sve_smlslb_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; +def SVMLSLB_U_LANE : SInst<"svmlslb_lane[_{d}]", "ddhhi", "UiUl", MergeNone, "aarch64_sve_umlslb_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; +def SVMLSLT_S_LANE : SInst<"svmlslt_lane[_{d}]", "ddhhi", "il", MergeNone, "aarch64_sve_smlslt_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; +def SVMLSLT_U_LANE : SInst<"svmlslt_lane[_{d}]", "ddhhi", "UiUl", MergeNone, "aarch64_sve_umlslt_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; +def SVMULLB_S_LANE : SInst<"svmullb_lane[_{d}]", "dhhi", "il", MergeNone, "aarch64_sve_smullb_lane", [IsStreamingCompatible], [ImmCheck<2, ImmCheckLaneIndex, 1>]>; +def SVMULLB_U_LANE : SInst<"svmullb_lane[_{d}]", "dhhi", "UiUl", MergeNone, "aarch64_sve_umullb_lane", [IsStreamingCompatible], [ImmCheck<2, ImmCheckLaneIndex, 1>]>; +def SVMULLT_S_LANE : SInst<"svmullt_lane[_{d}]", "dhhi", "il", MergeNone, "aarch64_sve_smullt_lane", [IsStreamingCompatible], [ImmCheck<2, ImmCheckLaneIndex, 1>]>; +def SVMULLT_U_LANE : SInst<"svmullt_lane[_{d}]", "dhhi", "UiUl", MergeNone, "aarch64_sve_umullt_lane", [IsStreamingCompatible], [ImmCheck<2, ImmCheckLaneIndex, 1>]>; +def SVQDMLALB_LANE : SInst<"svqdmlalb_lane[_{d}]", "ddhhi", "il", MergeNone, "aarch64_sve_sqdmlalb_lane", 
[IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; +def SVQDMLALT_LANE : SInst<"svqdmlalt_lane[_{d}]", "ddhhi", "il", MergeNone, "aarch64_sve_sqdmlalt_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; +def SVQDMLSLB_LANE : SInst<"svqdmlslb_lane[_{d}]", "ddhhi", "il", MergeNone, "aarch64_sve_sqdmlslb_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; +def SVQDMLSLT_LANE : SInst<"svqdmlslt_lane[_{d}]", "ddhhi", "il", MergeNone, "aarch64_sve_sqdmlslt_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; +def SVQDMULLB_LANE : SInst<"svqdmullb_lane[_{d}]", "dhhi", "il", MergeNone, "aarch64_sve_sqdmullb_lane", [IsStreamingCompatible], [ImmCheck<2, ImmCheckLaneIndex, 1>]>; +def SVQDMULLT_LANE : SInst<"svqdmullt_lane[_{d}]", "dhhi", "il", MergeNone, "aarch64_sve_sqdmullt_lane", [IsStreamingCompatible], [ImmCheck<2, ImmCheckLaneIndex, 1>]>; } //////////////////////////////////////////////////////////////////////////////// // SVE2 - Narrowing DSP operations let ArchGuard = "defined(__ARM_FEATURE_SVE2)" in { -def SVADDHNB : SInst<"svaddhnb[_{d}]", "hdd", "silUsUiUl", MergeNone, "aarch64_sve_addhnb">; -def SVADDHNT : SInst<"svaddhnt[_{d}]", "hhdd", "silUsUiUl", MergeNone, "aarch64_sve_addhnt">; -def SVRADDHNB : SInst<"svraddhnb[_{d}]", "hdd", "silUsUiUl", MergeNone, "aarch64_sve_raddhnb">; -def SVRADDHNT : SInst<"svraddhnt[_{d}]", "hhdd", "silUsUiUl", MergeNone, "aarch64_sve_raddhnt">; -def SVRSUBHNB : SInst<"svrsubhnb[_{d}]", "hdd", "silUsUiUl", MergeNone, "aarch64_sve_rsubhnb">; -def SVRSUBHNT : SInst<"svrsubhnt[_{d}]", "hhdd", "silUsUiUl", MergeNone, "aarch64_sve_rsubhnt">; -def SVSUBHNB : SInst<"svsubhnb[_{d}]", "hdd", "silUsUiUl", MergeNone, "aarch64_sve_subhnb">; -def SVSUBHNT : SInst<"svsubhnt[_{d}]", "hhdd", "silUsUiUl", MergeNone, "aarch64_sve_subhnt">; - -def SVADDHNB_N : SInst<"svaddhnb[_n_{d}]", "hda", "silUsUiUl", MergeNone, "aarch64_sve_addhnb">; -def SVADDHNT_N : SInst<"svaddhnt[_n_{d}]", "hhda", "silUsUiUl", MergeNone, "aarch64_sve_addhnt">; -def SVRADDHNB_N : SInst<"svraddhnb[_n_{d}]", "hda", "silUsUiUl", MergeNone, "aarch64_sve_raddhnb">; -def SVRADDHNT_N : SInst<"svraddhnt[_n_{d}]", "hhda", "silUsUiUl", MergeNone, "aarch64_sve_raddhnt">; -def SVRSUBHNB_N : SInst<"svrsubhnb[_n_{d}]", "hda", "silUsUiUl", MergeNone, "aarch64_sve_rsubhnb">; -def SVRSUBHNT_N : SInst<"svrsubhnt[_n_{d}]", "hhda", "silUsUiUl", MergeNone, "aarch64_sve_rsubhnt">; -def SVSUBHNB_N : SInst<"svsubhnb[_n_{d}]", "hda", "silUsUiUl", MergeNone, "aarch64_sve_subhnb">; -def SVSUBHNT_N : SInst<"svsubhnt[_n_{d}]", "hhda", "silUsUiUl", MergeNone, "aarch64_sve_subhnt">; - -def SVSHRNB : SInst<"svshrnb[_n_{d}]", "hdi", "silUsUiUl", MergeNone, "aarch64_sve_shrnb", [], [ImmCheck<1, ImmCheckShiftRightNarrow, 0>]>; -def SVRSHRNB : SInst<"svrshrnb[_n_{d}]", "hdi", "silUsUiUl", MergeNone, "aarch64_sve_rshrnb", [], [ImmCheck<1, ImmCheckShiftRightNarrow, 0>]>; -def SVQSHRUNB : SInst<"svqshrunb[_n_{d}]", "edi", "sil", MergeNone, "aarch64_sve_sqshrunb", [], [ImmCheck<1, ImmCheckShiftRightNarrow, 0>]>; -def SVQRSHRUNB : SInst<"svqrshrunb[_n_{d}]", "edi", "sil", MergeNone, "aarch64_sve_sqrshrunb", [], [ImmCheck<1, ImmCheckShiftRightNarrow, 0>]>; -def SVQSHRNB_S : SInst<"svqshrnb[_n_{d}]", "hdi", "sil", MergeNone, "aarch64_sve_sqshrnb", [], [ImmCheck<1, ImmCheckShiftRightNarrow, 0>]>; -def SVQSHRNB_U : SInst<"svqshrnb[_n_{d}]", "hdi", "UsUiUl", MergeNone, "aarch64_sve_uqshrnb", [], [ImmCheck<1, ImmCheckShiftRightNarrow, 0>]>; -def SVQRSHRNB_S : SInst<"svqrshrnb[_n_{d}]", 
"hdi", "sil", MergeNone, "aarch64_sve_sqrshrnb", [], [ImmCheck<1, ImmCheckShiftRightNarrow, 0>]>; -def SVQRSHRNB_U : SInst<"svqrshrnb[_n_{d}]", "hdi", "UsUiUl", MergeNone, "aarch64_sve_uqrshrnb", [], [ImmCheck<1, ImmCheckShiftRightNarrow, 0>]>; - -def SVSHRNT : SInst<"svshrnt[_n_{d}]", "hhdi", "silUsUiUl", MergeNone, "aarch64_sve_shrnt", [], [ImmCheck<2, ImmCheckShiftRightNarrow, 1>]>; -def SVRSHRNT : SInst<"svrshrnt[_n_{d}]", "hhdi", "silUsUiUl", MergeNone, "aarch64_sve_rshrnt", [], [ImmCheck<2, ImmCheckShiftRightNarrow, 1>]>; -def SVQSHRUNT : SInst<"svqshrunt[_n_{d}]", "eedi", "sil", MergeNone, "aarch64_sve_sqshrunt", [], [ImmCheck<2, ImmCheckShiftRightNarrow, 1>]>; -def SVQRSHRUNT : SInst<"svqrshrunt[_n_{d}]", "eedi", "sil", MergeNone, "aarch64_sve_sqrshrunt", [], [ImmCheck<2, ImmCheckShiftRightNarrow, 1>]>; -def SVQSHRNT_S : SInst<"svqshrnt[_n_{d}]", "hhdi", "sil", MergeNone, "aarch64_sve_sqshrnt", [], [ImmCheck<2, ImmCheckShiftRightNarrow, 1>]>; -def SVQSHRNT_U : SInst<"svqshrnt[_n_{d}]", "hhdi", "UsUiUl", MergeNone, "aarch64_sve_uqshrnt", [], [ImmCheck<2, ImmCheckShiftRightNarrow, 1>]>; -def SVQRSHRNT_S : SInst<"svqrshrnt[_n_{d}]", "hhdi", "sil", MergeNone, "aarch64_sve_sqrshrnt", [], [ImmCheck<2, ImmCheckShiftRightNarrow, 1>]>; -def SVQRSHRNT_U : SInst<"svqrshrnt[_n_{d}]", "hhdi", "UsUiUl", MergeNone, "aarch64_sve_uqrshrnt", [], [ImmCheck<2, ImmCheckShiftRightNarrow, 1>]>; +def SVADDHNB : SInst<"svaddhnb[_{d}]", "hdd", "silUsUiUl", MergeNone, "aarch64_sve_addhnb", [IsStreamingCompatible]>; +def SVADDHNT : SInst<"svaddhnt[_{d}]", "hhdd", "silUsUiUl", MergeNone, "aarch64_sve_addhnt", [IsStreamingCompatible]>; +def SVRADDHNB : SInst<"svraddhnb[_{d}]", "hdd", "silUsUiUl", MergeNone, "aarch64_sve_raddhnb", [IsStreamingCompatible]>; +def SVRADDHNT : SInst<"svraddhnt[_{d}]", "hhdd", "silUsUiUl", MergeNone, "aarch64_sve_raddhnt", [IsStreamingCompatible]>; +def SVRSUBHNB : SInst<"svrsubhnb[_{d}]", "hdd", "silUsUiUl", MergeNone, "aarch64_sve_rsubhnb", [IsStreamingCompatible]>; +def SVRSUBHNT : SInst<"svrsubhnt[_{d}]", "hhdd", "silUsUiUl", MergeNone, "aarch64_sve_rsubhnt", [IsStreamingCompatible]>; +def SVSUBHNB : SInst<"svsubhnb[_{d}]", "hdd", "silUsUiUl", MergeNone, "aarch64_sve_subhnb", [IsStreamingCompatible]>; +def SVSUBHNT : SInst<"svsubhnt[_{d}]", "hhdd", "silUsUiUl", MergeNone, "aarch64_sve_subhnt", [IsStreamingCompatible]>; + +def SVADDHNB_N : SInst<"svaddhnb[_n_{d}]", "hda", "silUsUiUl", MergeNone, "aarch64_sve_addhnb", [IsStreamingCompatible]>; +def SVADDHNT_N : SInst<"svaddhnt[_n_{d}]", "hhda", "silUsUiUl", MergeNone, "aarch64_sve_addhnt", [IsStreamingCompatible]>; +def SVRADDHNB_N : SInst<"svraddhnb[_n_{d}]", "hda", "silUsUiUl", MergeNone, "aarch64_sve_raddhnb", [IsStreamingCompatible]>; +def SVRADDHNT_N : SInst<"svraddhnt[_n_{d}]", "hhda", "silUsUiUl", MergeNone, "aarch64_sve_raddhnt", [IsStreamingCompatible]>; +def SVRSUBHNB_N : SInst<"svrsubhnb[_n_{d}]", "hda", "silUsUiUl", MergeNone, "aarch64_sve_rsubhnb", [IsStreamingCompatible]>; +def SVRSUBHNT_N : SInst<"svrsubhnt[_n_{d}]", "hhda", "silUsUiUl", MergeNone, "aarch64_sve_rsubhnt", [IsStreamingCompatible]>; +def SVSUBHNB_N : SInst<"svsubhnb[_n_{d}]", "hda", "silUsUiUl", MergeNone, "aarch64_sve_subhnb", [IsStreamingCompatible]>; +def SVSUBHNT_N : SInst<"svsubhnt[_n_{d}]", "hhda", "silUsUiUl", MergeNone, "aarch64_sve_subhnt", [IsStreamingCompatible]>; + +def SVSHRNB : SInst<"svshrnb[_n_{d}]", "hdi", "silUsUiUl", MergeNone, "aarch64_sve_shrnb", [IsStreamingCompatible], [ImmCheck<1, ImmCheckShiftRightNarrow, 0>]>; +def SVRSHRNB : 
SInst<"svrshrnb[_n_{d}]", "hdi", "silUsUiUl", MergeNone, "aarch64_sve_rshrnb", [IsStreamingCompatible], [ImmCheck<1, ImmCheckShiftRightNarrow, 0>]>; +def SVQSHRUNB : SInst<"svqshrunb[_n_{d}]", "edi", "sil", MergeNone, "aarch64_sve_sqshrunb", [IsStreamingCompatible], [ImmCheck<1, ImmCheckShiftRightNarrow, 0>]>; +def SVQRSHRUNB : SInst<"svqrshrunb[_n_{d}]", "edi", "sil", MergeNone, "aarch64_sve_sqrshrunb", [IsStreamingCompatible], [ImmCheck<1, ImmCheckShiftRightNarrow, 0>]>; +def SVQSHRNB_S : SInst<"svqshrnb[_n_{d}]", "hdi", "sil", MergeNone, "aarch64_sve_sqshrnb", [IsStreamingCompatible], [ImmCheck<1, ImmCheckShiftRightNarrow, 0>]>; +def SVQSHRNB_U : SInst<"svqshrnb[_n_{d}]", "hdi", "UsUiUl", MergeNone, "aarch64_sve_uqshrnb", [IsStreamingCompatible], [ImmCheck<1, ImmCheckShiftRightNarrow, 0>]>; +def SVQRSHRNB_S : SInst<"svqrshrnb[_n_{d}]", "hdi", "sil", MergeNone, "aarch64_sve_sqrshrnb", [IsStreamingCompatible], [ImmCheck<1, ImmCheckShiftRightNarrow, 0>]>; +def SVQRSHRNB_U : SInst<"svqrshrnb[_n_{d}]", "hdi", "UsUiUl", MergeNone, "aarch64_sve_uqrshrnb", [IsStreamingCompatible], [ImmCheck<1, ImmCheckShiftRightNarrow, 0>]>; + +def SVSHRNT : SInst<"svshrnt[_n_{d}]", "hhdi", "silUsUiUl", MergeNone, "aarch64_sve_shrnt", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRightNarrow, 1>]>; +def SVRSHRNT : SInst<"svrshrnt[_n_{d}]", "hhdi", "silUsUiUl", MergeNone, "aarch64_sve_rshrnt", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRightNarrow, 1>]>; +def SVQSHRUNT : SInst<"svqshrunt[_n_{d}]", "eedi", "sil", MergeNone, "aarch64_sve_sqshrunt", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRightNarrow, 1>]>; +def SVQRSHRUNT : SInst<"svqrshrunt[_n_{d}]", "eedi", "sil", MergeNone, "aarch64_sve_sqrshrunt", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRightNarrow, 1>]>; +def SVQSHRNT_S : SInst<"svqshrnt[_n_{d}]", "hhdi", "sil", MergeNone, "aarch64_sve_sqshrnt", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRightNarrow, 1>]>; +def SVQSHRNT_U : SInst<"svqshrnt[_n_{d}]", "hhdi", "UsUiUl", MergeNone, "aarch64_sve_uqshrnt", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRightNarrow, 1>]>; +def SVQRSHRNT_S : SInst<"svqrshrnt[_n_{d}]", "hhdi", "sil", MergeNone, "aarch64_sve_sqrshrnt", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRightNarrow, 1>]>; +def SVQRSHRNT_U : SInst<"svqrshrnt[_n_{d}]", "hhdi", "UsUiUl", MergeNone, "aarch64_sve_uqrshrnt", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRightNarrow, 1>]>; } //////////////////////////////////////////////////////////////////////////////// // SVE2 - Unary narrowing operations let ArchGuard = "defined(__ARM_FEATURE_SVE2)" in { -def SVQXTNB_S : SInst<"svqxtnb[_{d}]", "hd", "sil", MergeNone, "aarch64_sve_sqxtnb">; -def SVQXTNB_U : SInst<"svqxtnb[_{d}]", "hd", "UsUiUl", MergeNone, "aarch64_sve_uqxtnb">; -def SVQXTUNB_S : SInst<"svqxtunb[_{d}]", "ed", "sil", MergeNone, "aarch64_sve_sqxtunb">; +def SVQXTNB_S : SInst<"svqxtnb[_{d}]", "hd", "sil", MergeNone, "aarch64_sve_sqxtnb", [IsStreamingCompatible]>; +def SVQXTNB_U : SInst<"svqxtnb[_{d}]", "hd", "UsUiUl", MergeNone, "aarch64_sve_uqxtnb", [IsStreamingCompatible]>; +def SVQXTUNB_S : SInst<"svqxtunb[_{d}]", "ed", "sil", MergeNone, "aarch64_sve_sqxtunb", [IsStreamingCompatible]>; -def SVQXTNT_S : SInst<"svqxtnt[_{d}]", "hhd", "sil", MergeNone, "aarch64_sve_sqxtnt">; -def SVQXTNT_U : SInst<"svqxtnt[_{d}]", "hhd", "UsUiUl", MergeNone, "aarch64_sve_uqxtnt">; -def SVQXTUNT_S : SInst<"svqxtunt[_{d}]", "eed", "sil", MergeNone, "aarch64_sve_sqxtunt">; +def SVQXTNT_S : SInst<"svqxtnt[_{d}]", 
"hhd", "sil", MergeNone, "aarch64_sve_sqxtnt", [IsStreamingCompatible]>; +def SVQXTNT_U : SInst<"svqxtnt[_{d}]", "hhd", "UsUiUl", MergeNone, "aarch64_sve_uqxtnt", [IsStreamingCompatible]>; +def SVQXTUNT_S : SInst<"svqxtunt[_{d}]", "eed", "sil", MergeNone, "aarch64_sve_sqxtunt", [IsStreamingCompatible]>; } //////////////////////////////////////////////////////////////////////////////// @@ -1949,18 +1731,19 @@ // SVE2 - Polynomial arithmetic let ArchGuard = "defined(__ARM_FEATURE_SVE2)" in { -def SVEORBT : SInst<"sveorbt[_{d}]", "dddd", "csilUcUsUiUl", MergeNone, "aarch64_sve_eorbt">; -def SVEORBT_N : SInst<"sveorbt[_n_{d}]", "ddda", "csilUcUsUiUl", MergeNone, "aarch64_sve_eorbt">; -def SVEORTB : SInst<"sveortb[_{d}]", "dddd", "csilUcUsUiUl", MergeNone, "aarch64_sve_eortb">; -def SVEORTB_N : SInst<"sveortb[_n_{d}]", "ddda", "csilUcUsUiUl", MergeNone, "aarch64_sve_eortb">; -def SVPMUL : SInst<"svpmul[_{d}]", "ddd", "Uc", MergeNone, "aarch64_sve_pmul">; -def SVPMUL_N : SInst<"svpmul[_n_{d}]", "dda", "Uc", MergeNone, "aarch64_sve_pmul">; -def SVPMULLB : SInst<"svpmullb[_{d}]", "dhh", "UsUl", MergeNone>; -def SVPMULLB_N : SInst<"svpmullb[_n_{d}]", "dhR", "UsUl", MergeNone>; -def SVPMULLB_PAIR : SInst<"svpmullb_pair[_{d}]", "ddd", "UcUi", MergeNone, "aarch64_sve_pmullb_pair">; -def SVPMULLB_PAIR_N : SInst<"svpmullb_pair[_n_{d}]", "dda", "UcUi", MergeNone, "aarch64_sve_pmullb_pair">; -def SVPMULLT : SInst<"svpmullt[_{d}]", "dhh", "UsUl", MergeNone>; -def SVPMULLT_N : SInst<"svpmullt[_n_{d}]", "dhR", "UsUl", MergeNone>; +def SVEORBT : SInst<"sveorbt[_{d}]", "dddd", "csilUcUsUiUl", MergeNone, "aarch64_sve_eorbt", [IsStreamingCompatible]>; +def SVEORBT_N : SInst<"sveorbt[_n_{d}]", "ddda", "csilUcUsUiUl", MergeNone, "aarch64_sve_eorbt", [IsStreamingCompatible]>; +def SVEORTB : SInst<"sveortb[_{d}]", "dddd", "csilUcUsUiUl", MergeNone, "aarch64_sve_eortb", [IsStreamingCompatible]>; +def SVEORTB_N : SInst<"sveortb[_n_{d}]", "ddda", "csilUcUsUiUl", MergeNone, "aarch64_sve_eortb", [IsStreamingCompatible]>; +def SVPMUL : SInst<"svpmul[_{d}]", "ddd", "Uc", MergeNone, "aarch64_sve_pmul", [IsStreamingCompatible]>; +def SVPMUL_N : SInst<"svpmul[_n_{d}]", "dda", "Uc", MergeNone, "aarch64_sve_pmul", [IsStreamingCompatible]>; +def SVPMULLB : SInst<"svpmullb[_{d}]", "dhh", "UsUl", MergeNone, "", [IsStreamingCompatible]>; +def SVPMULLB_N : SInst<"svpmullb[_n_{d}]", "dhR", "UsUl", MergeNone, "", [IsStreamingCompatible]>; +// TODO: Are these AES instructions?! 
+def SVPMULLB_PAIR : SInst<"svpmullb_pair[_{d}]", "ddd", "UcUi", MergeNone, "aarch64_sve_pmullb_pair", [IsStreamingCompatible]>; +def SVPMULLB_PAIR_N : SInst<"svpmullb_pair[_n_{d}]", "dda", "UcUi", MergeNone, "aarch64_sve_pmullb_pair", [IsStreamingCompatible]>; +def SVPMULLT : SInst<"svpmullt[_{d}]", "dhh", "UsUl", MergeNone, "", [IsStreamingCompatible]>; +def SVPMULLT_N : SInst<"svpmullt[_n_{d}]", "dhR", "UsUl", MergeNone, "", [IsStreamingCompatible]>; def SVPMULLT_PAIR : SInst<"svpmullt_pair[_{d}]", "ddd", "UcUi", MergeNone, "aarch64_sve_pmullt_pair">; def SVPMULLT_PAIR_N : SInst<"svpmullt_pair[_n_{d}]", "dda", "UcUi", MergeNone, "aarch64_sve_pmullt_pair">; } @@ -1969,8 +1752,8 @@ // SVE2 - Complex integer dot product let ArchGuard = "defined(__ARM_FEATURE_SVE2)" in { -def SVCDOT : SInst<"svcdot[_{d}]", "ddqqi", "il", MergeNone, "aarch64_sve_cdot", [], [ImmCheck<3, ImmCheckComplexRotAll90>]>; -def SVCDOT_LANE : SInst<"svcdot_lane[_{d}]", "ddqqii", "il", MergeNone, "aarch64_sve_cdot_lane", [], [ImmCheck<4, ImmCheckComplexRotAll90>, +def SVCDOT : SInst<"svcdot[_{d}]", "ddqqi", "il", MergeNone, "aarch64_sve_cdot", [IsStreamingCompatible], [ImmCheck<3, ImmCheckComplexRotAll90>]>; +def SVCDOT_LANE : SInst<"svcdot_lane[_{d}]", "ddqqii", "il", MergeNone, "aarch64_sve_cdot_lane", [IsStreamingCompatible], [ImmCheck<4, ImmCheckComplexRotAll90>, ImmCheck<3, ImmCheckLaneIndexDot, 2>]>; } @@ -1978,27 +1761,27 @@ // SVE2 - Floating-point widening multiply-accumulate let ArchGuard = "defined(__ARM_FEATURE_SVE2)" in { -def SVMLALB_F : SInst<"svmlalb[_{d}]", "ddhh", "f", MergeNone, "aarch64_sve_fmlalb">; -def SVMLALB_F_N : SInst<"svmlalb[_n_{d}]", "ddhR", "f", MergeNone, "aarch64_sve_fmlalb">; -def SVMLALB_F_LANE : SInst<"svmlalb_lane[_{d}]", "ddhhi", "f", MergeNone, "aarch64_sve_fmlalb_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; -def SVMLALT_F : SInst<"svmlalt[_{d}]", "ddhh", "f", MergeNone, "aarch64_sve_fmlalt">; -def SVMLALT_F_N : SInst<"svmlalt[_n_{d}]", "ddhR", "f", MergeNone, "aarch64_sve_fmlalt">; -def SVMLALT_F_LANE : SInst<"svmlalt_lane[_{d}]", "ddhhi", "f", MergeNone, "aarch64_sve_fmlalt_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; -def SVMLSLB_F : SInst<"svmlslb[_{d}]", "ddhh", "f", MergeNone, "aarch64_sve_fmlslb">; -def SVMLSLB_F_N : SInst<"svmlslb[_n_{d}]", "ddhR", "f", MergeNone, "aarch64_sve_fmlslb">; -def SVMLSLB_F_LANE : SInst<"svmlslb_lane[_{d}]", "ddhhi", "f", MergeNone, "aarch64_sve_fmlslb_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; -def SVMLSLT_F : SInst<"svmlslt[_{d}]", "ddhh", "f", MergeNone, "aarch64_sve_fmlslt">; -def SVMLSLT_F_N : SInst<"svmlslt[_n_{d}]", "ddhR", "f", MergeNone, "aarch64_sve_fmlslt">; -def SVMLSLT_F_LANE : SInst<"svmlslt_lane[_{d}]", "ddhhi", "f", MergeNone, "aarch64_sve_fmlslt_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; +def SVMLALB_F : SInst<"svmlalb[_{d}]", "ddhh", "f", MergeNone, "aarch64_sve_fmlalb", [IsStreamingCompatible]>; +def SVMLALB_F_N : SInst<"svmlalb[_n_{d}]", "ddhR", "f", MergeNone, "aarch64_sve_fmlalb", [IsStreamingCompatible]>; +def SVMLALB_F_LANE : SInst<"svmlalb_lane[_{d}]", "ddhhi", "f", MergeNone, "aarch64_sve_fmlalb_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; +def SVMLALT_F : SInst<"svmlalt[_{d}]", "ddhh", "f", MergeNone, "aarch64_sve_fmlalt", [IsStreamingCompatible]>; +def SVMLALT_F_N : SInst<"svmlalt[_n_{d}]", "ddhR", "f", MergeNone, "aarch64_sve_fmlalt", [IsStreamingCompatible]>; +def SVMLALT_F_LANE : SInst<"svmlalt_lane[_{d}]", "ddhhi", "f", MergeNone, "aarch64_sve_fmlalt_lane", 
[IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; +def SVMLSLB_F : SInst<"svmlslb[_{d}]", "ddhh", "f", MergeNone, "aarch64_sve_fmlslb", [IsStreamingCompatible]>; +def SVMLSLB_F_N : SInst<"svmlslb[_n_{d}]", "ddhR", "f", MergeNone, "aarch64_sve_fmlslb", [IsStreamingCompatible]>; +def SVMLSLB_F_LANE : SInst<"svmlslb_lane[_{d}]", "ddhhi", "f", MergeNone, "aarch64_sve_fmlslb_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; +def SVMLSLT_F : SInst<"svmlslt[_{d}]", "ddhh", "f", MergeNone, "aarch64_sve_fmlslt", [IsStreamingCompatible]>; +def SVMLSLT_F_N : SInst<"svmlslt[_n_{d}]", "ddhR", "f", MergeNone, "aarch64_sve_fmlslt", [IsStreamingCompatible]>; +def SVMLSLT_F_LANE : SInst<"svmlslt_lane[_{d}]", "ddhhi", "f", MergeNone, "aarch64_sve_fmlslt_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; } //////////////////////////////////////////////////////////////////////////////// // SVE2 - Floating-point integer binary logarithm let ArchGuard = "defined(__ARM_FEATURE_SVE2)" in { -def SVLOGB_M : SInst<"svlogb[_{d}]", "xxPd", "hfd", MergeOp1, "aarch64_sve_flogb">; -def SVLOGB_X : SInst<"svlogb[_{d}]", "xPd", "hfd", MergeAnyExp, "aarch64_sve_flogb">; -def SVLOGB_Z : SInst<"svlogb[_{d}]", "xPd", "hfd", MergeZeroExp, "aarch64_sve_flogb">; +def SVLOGB_M : SInst<"svlogb[_{d}]", "xxPd", "hfd", MergeOp1, "aarch64_sve_flogb", [IsStreamingCompatible]>; +def SVLOGB_X : SInst<"svlogb[_{d}]", "xPd", "hfd", MergeAnyExp, "aarch64_sve_flogb", [IsStreamingCompatible]>; +def SVLOGB_Z : SInst<"svlogb[_{d}]", "xPd", "hfd", MergeZeroExp, "aarch64_sve_flogb", [IsStreamingCompatible]>; } //////////////////////////////////////////////////////////////////////////////// @@ -2020,32 +1803,32 @@ //////////////////////////////////////////////////////////////////////////////// // SVE2 - Contiguous conflict detection let ArchGuard = "defined(__ARM_FEATURE_SVE2)" in { -def SVWHILERW_B : SInst<"svwhilerw[_{1}]", "Pcc", "cUc", MergeNone, "aarch64_sve_whilerw_b", [IsOverloadWhileRW]>; -def SVWHILERW_H : SInst<"svwhilerw[_{1}]", "Pcc", "sUsh", MergeNone, "aarch64_sve_whilerw_h", [IsOverloadWhileRW]>; -def SVWHILERW_S : SInst<"svwhilerw[_{1}]", "Pcc", "iUif", MergeNone, "aarch64_sve_whilerw_s", [IsOverloadWhileRW]>; -def SVWHILERW_D : SInst<"svwhilerw[_{1}]", "Pcc", "lUld", MergeNone, "aarch64_sve_whilerw_d", [IsOverloadWhileRW]>; +def SVWHILERW_B : SInst<"svwhilerw[_{1}]", "Pcc", "cUc", MergeNone, "aarch64_sve_whilerw_b", [IsOverloadWhileRW, IsStreamingCompatible]>; +def SVWHILERW_H : SInst<"svwhilerw[_{1}]", "Pcc", "sUsh", MergeNone, "aarch64_sve_whilerw_h", [IsOverloadWhileRW, IsStreamingCompatible]>; +def SVWHILERW_S : SInst<"svwhilerw[_{1}]", "Pcc", "iUif", MergeNone, "aarch64_sve_whilerw_s", [IsOverloadWhileRW, IsStreamingCompatible]>; +def SVWHILERW_D : SInst<"svwhilerw[_{1}]", "Pcc", "lUld", MergeNone, "aarch64_sve_whilerw_d", [IsOverloadWhileRW, IsStreamingCompatible]>; -def SVWHILEWR_B : SInst<"svwhilewr[_{1}]", "Pcc", "cUc", MergeNone, "aarch64_sve_whilewr_b", [IsOverloadWhileRW]>; -def SVWHILEWR_H : SInst<"svwhilewr[_{1}]", "Pcc", "sUsh", MergeNone, "aarch64_sve_whilewr_h", [IsOverloadWhileRW]>; -def SVWHILEWR_S : SInst<"svwhilewr[_{1}]", "Pcc", "iUif", MergeNone, "aarch64_sve_whilewr_s", [IsOverloadWhileRW]>; -def SVWHILEWR_D : SInst<"svwhilewr[_{1}]", "Pcc", "lUld", MergeNone, "aarch64_sve_whilewr_d", [IsOverloadWhileRW]>; +def SVWHILEWR_B : SInst<"svwhilewr[_{1}]", "Pcc", "cUc", MergeNone, "aarch64_sve_whilewr_b", [IsOverloadWhileRW, IsStreamingCompatible]>; +def 
SVWHILEWR_H : SInst<"svwhilewr[_{1}]", "Pcc", "sUsh", MergeNone, "aarch64_sve_whilewr_h", [IsOverloadWhileRW, IsStreamingCompatible]>; +def SVWHILEWR_S : SInst<"svwhilewr[_{1}]", "Pcc", "iUif", MergeNone, "aarch64_sve_whilewr_s", [IsOverloadWhileRW, IsStreamingCompatible]>; +def SVWHILEWR_D : SInst<"svwhilewr[_{1}]", "Pcc", "lUld", MergeNone, "aarch64_sve_whilewr_d", [IsOverloadWhileRW, IsStreamingCompatible]>; } let ArchGuard = "defined(__ARM_FEATURE_SVE2) && defined(__ARM_FEATURE_BF16_SCALAR_ARITHMETIC)" in { -def SVWHILERW_H_BF16 : SInst<"svwhilerw[_{1}]", "Pcc", "b", MergeNone, "aarch64_sve_whilerw_h", [IsOverloadWhileRW]>; -def SVWHILEWR_H_BF16 : SInst<"svwhilewr[_{1}]", "Pcc", "b", MergeNone, "aarch64_sve_whilewr_h", [IsOverloadWhileRW]>; +def SVWHILERW_H_BF16 : SInst<"svwhilerw[_{1}]", "Pcc", "b", MergeNone, "aarch64_sve_whilerw_h", [IsOverloadWhileRW, IsStreamingCompatible]>; +def SVWHILEWR_H_BF16 : SInst<"svwhilewr[_{1}]", "Pcc", "b", MergeNone, "aarch64_sve_whilewr_h", [IsOverloadWhileRW, IsStreamingCompatible]>; } //////////////////////////////////////////////////////////////////////////////// // SVE2 - Extended table lookup/permute let ArchGuard = "defined(__ARM_FEATURE_SVE2)" in { -def SVTBL2 : SInst<"svtbl2[_{d}]", "d2u", "csilUcUsUiUlhfd", MergeNone>; -def SVTBX : SInst<"svtbx[_{d}]", "dddu", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_tbx">; +def SVTBL2 : SInst<"svtbl2[_{d}]", "d2u", "csilUcUsUiUlhfd", MergeNone, "", [IsStreamingCompatible]>; +def SVTBX : SInst<"svtbx[_{d}]", "dddu", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_tbx", [IsStreamingCompatible]>; } let ArchGuard = "defined(__ARM_FEATURE_SVE2) && defined(__ARM_FEATURE_SVE_BF16)" in { -def SVTBL2_BF16 : SInst<"svtbl2[_{d}]", "d2u", "b", MergeNone>; -def SVTBX_BF16 : SInst<"svtbx[_{d}]", "dddu", "b", MergeNone, "aarch64_sve_tbx">; +def SVTBL2_BF16 : SInst<"svtbl2[_{d}]", "d2u", "b", MergeNone, "", [IsStreamingCompatible]>; +def SVTBX_BF16 : SInst<"svtbx[_{d}]", "dddu", "b", MergeNone, "aarch64_sve_tbx", [IsStreamingCompatible]>; } //////////////////////////////////////////////////////////////////////////////// @@ -2081,3 +1864,97 @@ def SVBGRP : SInst<"svbgrp[_{d}]", "ddd", "UcUsUiUl", MergeNone, "aarch64_sve_bgrp_x">; def SVBGRP_N : SInst<"svbgrp[_n_{d}]", "dda", "UcUsUiUl", MergeNone, "aarch64_sve_bgrp_x">; } + +//////////////////////////////////////////////////////////////////////////////// +// SME + +let ArchGuard = "defined(__ARM_FEATURE_SME)" in { +def SVSCLAMP : SInst<"svsclamp[_{d}]", "dddd", "csil", MergeNone, "aarch64_sve_sclamp">; +def SVUCLAMP : SInst<"svuclamp[_{d}]", "dddd", "UcUsUiUl", MergeNone, "aarch64_sve_uclamp">; + +defm SVREVD : SInstZPZ<"svrevd", "csilUcUsUiUl", "aarch64_sve_revd">; + +def SVPSEL : SInst<"svpsel[_{d}]", "PPPk", "PcPsPiPl", MergeNone, "aarch64_sve_psel">; +} + + +let ArchGuard = "defined(__ARM_FEATURE_SME2)" in { +def SVPTRUE_COUNT_C8 : SInst<"svptrue_{d}", "}v", "Qc", MergeNone, "aarch64_sve_ptrue_c8", [IsStreaming, IsOverloadNone], []>; +def SVPTRUE_COUNT_C16 : SInst<"svptrue_{d}", "}v", "Qs", MergeNone, "aarch64_sve_ptrue_c16", [IsStreaming, IsOverloadNone], []>; +def SVPTRUE_COUNT_C32 : SInst<"svptrue_{d}", "}v", "Qi", MergeNone, "aarch64_sve_ptrue_c32", [IsStreaming, IsOverloadNone], []>; +def SVPTRUE_COUNT_C64 : SInst<"svptrue_{d}", "}v", "Ql", MergeNone, "aarch64_sve_ptrue_c64", [IsStreaming, IsOverloadNone], []>; + +def SVPEXT_SINGLE : SInst<"svpext_lane_{d}", "P}i", "QcQsQiQl", MergeNone, "aarch64_sve_pext", [IsStreaming], [ImmCheck<1, ImmCheck0_3>]>; + +def SVLD1B_VG2 
: MInst<"svld1b[_{2}]_x2", "2}c", "cUc", [IsStructLoad, IsStreaming], MemEltTyDefault, "aarch64_sve_ld1_pn_vg2">; +def SVLD1H_VG2 : MInst<"svld1h[_{2}]_x2", "2}c", "sUshb", [IsStructLoad, IsStreaming], MemEltTyDefault, "aarch64_sve_ld1_pn_vg2">; +def SVLD1W_VG2 : MInst<"svld1w[_{2}]_x2", "2}c", "iUif", [IsStructLoad, IsStreaming], MemEltTyDefault, "aarch64_sve_ld1_pn_vg2">; +def SVLD1D_VG2 : MInst<"svld1d[_{2}]_x2", "2}c", "lUld", [IsStructLoad, IsStreaming], MemEltTyDefault, "aarch64_sve_ld1_pn_vg2">; +def SVLD1B_VG4 : MInst<"svld1b[_{2}]_x4", "4}c", "cUc", [IsStructLoad, IsStreaming], MemEltTyDefault, "aarch64_sve_ld1_pn_vg4">; +def SVLD1H_VG4 : MInst<"svld1h[_{2}]_x4", "4}c", "sUshb", [IsStructLoad, IsStreaming], MemEltTyDefault, "aarch64_sve_ld1_pn_vg4">; +def SVLD1W_VG4 : MInst<"svld1w[_{2}]_x4", "4}c", "iUif", [IsStructLoad, IsStreaming], MemEltTyDefault, "aarch64_sve_ld1_pn_vg4">; +def SVLD1D_VG4 : MInst<"svld1d[_{2}]_x4", "4}c", "lUld", [IsStructLoad, IsStreaming], MemEltTyDefault, "aarch64_sve_ld1_pn_vg4">; + +def SVLDNT1B_VG2 : MInst<"svldnt1b[_{2}]_x2", "2}c", "cUc", [IsStructLoad, IsStreaming], MemEltTyDefault, "aarch64_sve_ldnt1_pn_vg2">; +def SVLDNT1H_VG2 : MInst<"svldnt1h[_{2}]_x2", "2}c", "sUshb", [IsStructLoad, IsStreaming], MemEltTyDefault, "aarch64_sve_ldnt1_pn_vg2">; +def SVLDNT1W_VG2 : MInst<"svldnt1w[_{2}]_x2", "2}c", "iUif", [IsStructLoad, IsStreaming], MemEltTyDefault, "aarch64_sve_ldnt1_pn_vg2">; +def SVLDNT1D_VG2 : MInst<"svldnt1d[_{2}]_x2", "2}c", "lUld", [IsStructLoad, IsStreaming], MemEltTyDefault, "aarch64_sve_ldnt1_pn_vg2">; +def SVLDNT1B_VG4 : MInst<"svldnt1b[_{2}]_x4", "4}c", "cUc", [IsStructLoad, IsStreaming], MemEltTyDefault, "aarch64_sve_ldnt1_pn_vg4">; +def SVLDNT1H_VG4 : MInst<"svldnt1h[_{2}]_x4", "4}c", "sUshb", [IsStructLoad, IsStreaming], MemEltTyDefault, "aarch64_sve_ldnt1_pn_vg4">; +def SVLDNT1W_VG4 : MInst<"svldnt1w[_{2}]_x4", "4}c", "iUif", [IsStructLoad, IsStreaming], MemEltTyDefault, "aarch64_sve_ldnt1_pn_vg4">; +def SVLDNT1D_VG4 : MInst<"svldnt1d[_{2}]_x4", "4}c", "lUld", [IsStructLoad, IsStreaming], MemEltTyDefault, "aarch64_sve_ldnt1_pn_vg4">; + +def SVLD1B_VNUM_VG2 : MInst<"svld1b_vnum[_{2}]_x2", "2}cl", "cUc", [IsStructLoad, IsStreaming], MemEltTyDefault, "aarch64_sve_ld1_pn_vg2">; +def SVLD1H_VNUM_VG2 : MInst<"svld1h_vnum[_{2}]_x2", "2}cl", "sUshb", [IsStructLoad, IsStreaming], MemEltTyDefault, "aarch64_sve_ld1_pn_vg2">; +def SVLD1W_VNUM_VG2 : MInst<"svld1w_vnum[_{2}]_x2", "2}cl", "iUif", [IsStructLoad, IsStreaming], MemEltTyDefault, "aarch64_sve_ld1_pn_vg2">; +def SVLD1D_VNUM_VG2 : MInst<"svld1d_vnum[_{2}]_x2", "2}cl", "lUld", [IsStructLoad, IsStreaming], MemEltTyDefault, "aarch64_sve_ld1_pn_vg2">; +def SVLD1B_VNUM_VG4 : MInst<"svld1b_vnum[_{2}]_x4", "4}cl", "cUc", [IsStructLoad, IsStreaming], MemEltTyDefault, "aarch64_sve_ld1_pn_vg4">; +def SVLD1H_VNUM_VG4 : MInst<"svld1h_vnum[_{2}]_x4", "4}cl", "sUshb", [IsStructLoad, IsStreaming], MemEltTyDefault, "aarch64_sve_ld1_pn_vg4">; +def SVLD1W_VNUM_VG4 : MInst<"svld1w_vnum[_{2}]_x4", "4}cl", "iUif", [IsStructLoad, IsStreaming], MemEltTyDefault, "aarch64_sve_ld1_pn_vg4">; +def SVLD1D_VNUM_VG4 : MInst<"svld1d_vnum[_{2}]_x4", "4}cl", "lUld", [IsStructLoad, IsStreaming], MemEltTyDefault, "aarch64_sve_ld1_pn_vg4">; + +def SVLDNT1B_VNUM_VG2 : MInst<"svldnt1b_vnum[_{2}]_x2", "2}cl", "cUc", [IsStructLoad, IsStreaming], MemEltTyDefault, "aarch64_sve_ldnt1_pn_vg2">; +def SVLDNT1H_VNUM_VG2 : MInst<"svldnt1h_vnum[_{2}]_x2", "2}cl", "sUshb", [IsStructLoad, IsStreaming], MemEltTyDefault, 
"aarch64_sve_ldnt1_pn_vg2">; +def SVLDNT1W_VNUM_VG2 : MInst<"svldnt1w_vnum[_{2}]_x2", "2}cl", "iUif", [IsStructLoad, IsStreaming], MemEltTyDefault, "aarch64_sve_ldnt1_pn_vg2">; +def SVLDNT1D_VNUM_VG2 : MInst<"svldnt1d_vnum[_{2}]_x2", "2}cl", "lUld", [IsStructLoad, IsStreaming], MemEltTyDefault, "aarch64_sve_ldnt1_pn_vg2">; +def SVLDNT1B_VNUM_VG4 : MInst<"svldnt1b_vnum[_{2}]_x4", "4}cl", "cUc", [IsStructLoad, IsStreaming], MemEltTyDefault, "aarch64_sve_ldnt1_pn_vg4">; +def SVLDNT1H_VNUM_VG4 : MInst<"svldnt1h_vnum[_{2}]_x4", "4}cl", "sUshb", [IsStructLoad, IsStreaming], MemEltTyDefault, "aarch64_sve_ldnt1_pn_vg4">; +def SVLDNT1W_VNUM_VG4 : MInst<"svldnt1w_vnum[_{2}]_x4", "4}cl", "iUif", [IsStructLoad, IsStreaming], MemEltTyDefault, "aarch64_sve_ldnt1_pn_vg4">; +def SVLDNT1D_VNUM_VG4 : MInst<"svldnt1d_vnum[_{2}]_x4", "4}cl", "lUld", [IsStructLoad, IsStreaming], MemEltTyDefault, "aarch64_sve_ldnt1_pn_vg4">; + +def SVST1B_VG2 : MInst<"svst1b[_{2}]_x2", "v}p2", "cUc", [IsStructStore, IsStreaming], MemEltTyDefault, "aarch64_sve_st1_pn_vg2">; +def SVST1H_VG2 : MInst<"svst1h[_{2}]_x2", "v}p2", "sUshb", [IsStructStore, IsStreaming], MemEltTyDefault, "aarch64_sve_st1_pn_vg2">; +def SVST1W_VG2 : MInst<"svst1w[_{2}]_x2", "v}p2", "iUif", [IsStructStore, IsStreaming], MemEltTyDefault, "aarch64_sve_st1_pn_vg2">; +def SVST1D_VG2 : MInst<"svst1d[_{2}]_x2", "v}p2", "lUld", [IsStructStore, IsStreaming], MemEltTyDefault, "aarch64_sve_st1_pn_vg2">; +def SVST1B_VG4 : MInst<"svst1b[_{2}]_x4", "v}p4", "cUc", [IsStructStore, IsStreaming], MemEltTyDefault, "aarch64_sve_st1_pn_vg4">; +def SVST1H_VG4 : MInst<"svst1h[_{2}]_x4", "v}p4", "sUshb", [IsStructStore, IsStreaming], MemEltTyDefault, "aarch64_sve_st1_pn_vg4">; +def SVST1W_VG4 : MInst<"svst1w[_{2}]_x4", "v}p4", "iUif", [IsStructStore, IsStreaming], MemEltTyDefault, "aarch64_sve_st1_pn_vg4">; +def SVST1D_VG4 : MInst<"svst1d[_{2}]_x4", "v}p4", "lUld", [IsStructStore, IsStreaming], MemEltTyDefault, "aarch64_sve_st1_pn_vg4">; + +def SVST1B_VNUM_VG2 : MInst<"svst1b_vnum[_{2}]_x2", "v}pl2", "cUc", [IsStructStore, IsStreaming], MemEltTyDefault, "aarch64_sve_st1_pn_vg2">; +def SVST1H_VNUM_VG2 : MInst<"svst1h_vnum[_{2}]_x2", "v}pl2", "sUshb", [IsStructStore, IsStreaming], MemEltTyDefault, "aarch64_sve_st1_pn_vg2">; +def SVST1W_VNUM_VG2 : MInst<"svst1w_vnum[_{2}]_x2", "v}pl2", "iUif", [IsStructStore, IsStreaming], MemEltTyDefault, "aarch64_sve_st1_pn_vg2">; +def SVST1D_VNUM_VG2 : MInst<"svst1d_vnum[_{2}]_x2", "v}pl2", "lUld", [IsStructStore, IsStreaming], MemEltTyDefault, "aarch64_sve_st1_pn_vg2">; +def SVST1B_VNUM_VG4 : MInst<"svst1b_vnum[_{2}]_x4", "v}pl4", "cUc", [IsStructStore, IsStreaming], MemEltTyDefault, "aarch64_sve_st1_pn_vg4">; +def SVST1H_VNUM_VG4 : MInst<"svst1h_vnum[_{2}]_x4", "v}pl4", "sUshb", [IsStructStore, IsStreaming], MemEltTyDefault, "aarch64_sve_st1_pn_vg4">; +def SVST1W_VNUM_VG4 : MInst<"svst1w_vnum[_{2}]_x4", "v}pl4", "iUif", [IsStructStore, IsStreaming], MemEltTyDefault, "aarch64_sve_st1_pn_vg4">; +def SVST1D_VNUM_VG4 : MInst<"svst1d_vnum[_{2}]_x4", "v}pl4", "lUld", [IsStructStore, IsStreaming], MemEltTyDefault, "aarch64_sve_st1_pn_vg4">; + +def SVSTNT1B_VG2 : MInst<"svstnt1b[_{2}]_x2", "v}p2", "cUc", [IsStructStore, IsStreaming], MemEltTyDefault, "aarch64_sve_stnt1_pn_vg2">; +def SVSTNT1H_VG2 : MInst<"svstnt1h[_{2}]_x2", "v}p2", "sUshb", [IsStructStore, IsStreaming], MemEltTyDefault, "aarch64_sve_stnt1_pn_vg2">; +def SVSTNT1W_VG2 : MInst<"svstnt1w[_{2}]_x2", "v}p2", "iUif", [IsStructStore, IsStreaming], MemEltTyDefault, 
"aarch64_sve_stnt1_pn_vg2">; +def SVSTNT1D_VG2 : MInst<"svstnt1d[_{2}]_x2", "v}p2", "lUld", [IsStructStore, IsStreaming], MemEltTyDefault, "aarch64_sve_stnt1_pn_vg2">; +def SVSTNT1B_VG4 : MInst<"svstnt1b[_{2}]_x4", "v}p4", "cUc", [IsStructStore, IsStreaming], MemEltTyDefault, "aarch64_sve_stnt1_pn_vg4">; +def SVSTNT1H_VG4 : MInst<"svstnt1h[_{2}]_x4", "v}p4", "sUshb", [IsStructStore, IsStreaming], MemEltTyDefault, "aarch64_sve_stnt1_pn_vg4">; +def SVSTNT1W_VG4 : MInst<"svstnt1w[_{2}]_x4", "v}p4", "iUif", [IsStructStore, IsStreaming], MemEltTyDefault, "aarch64_sve_stnt1_pn_vg4">; +def SVSTNT1D_VG4 : MInst<"svstnt1d[_{2}]_x4", "v}p4", "lUld", [IsStructStore, IsStreaming], MemEltTyDefault, "aarch64_sve_stnt1_pn_vg4">; + +def SVSTNT1B_VNUM_VG2 : MInst<"svstnt1b_vnum[_{2}]_x2", "v}pl2", "cUc", [IsStructStore, IsStreaming], MemEltTyDefault, "aarch64_sve_stnt1_pn_vg2">; +def SVSTNT1H_VNUM_VG2 : MInst<"svstnt1h_vnum[_{2}]_x2", "v}pl2", "sUshb", [IsStructStore, IsStreaming], MemEltTyDefault, "aarch64_sve_stnt1_pn_vg2">; +def SVSTNT1W_VNUM_VG2 : MInst<"svstnt1w_vnum[_{2}]_x2", "v}pl2", "iUif", [IsStructStore, IsStreaming], MemEltTyDefault, "aarch64_sve_stnt1_pn_vg2">; +def SVSTNT1D_VNUM_VG2 : MInst<"svstnt1d_vnum[_{2}]_x2", "v}pl2", "lUld", [IsStructStore, IsStreaming], MemEltTyDefault, "aarch64_sve_stnt1_pn_vg2">; +def SVSTNT1B_VNUM_VG4 : MInst<"svstnt1b_vnum[_{2}]_x4", "v}pl4", "cUc", [IsStructStore, IsStreaming], MemEltTyDefault, "aarch64_sve_stnt1_pn_vg4">; +def SVSTNT1H_VNUM_VG4 : MInst<"svstnt1h_vnum[_{2}]_x4", "v}pl4", "sUshb", [IsStructStore, IsStreaming], MemEltTyDefault, "aarch64_sve_stnt1_pn_vg4">; +def SVSTNT1W_VNUM_VG4 : MInst<"svstnt1w_vnum[_{2}]_x4", "v}pl4", "iUif", [IsStructStore, IsStreaming], MemEltTyDefault, "aarch64_sve_stnt1_pn_vg4">; +def SVSTNT1D_VNUM_VG4 : MInst<"svstnt1d_vnum[_{2}]_x4", "v}pl4", "lUld", [IsStructStore, IsStreaming], MemEltTyDefault, "aarch64_sve_stnt1_pn_vg4">; +} diff --git a/clang/include/clang/Basic/arm_sve_common.td b/clang/include/clang/Basic/arm_sve_common.td new file mode 100644 --- /dev/null +++ b/clang/include/clang/Basic/arm_sve_common.td @@ -0,0 +1,246 @@ +//===----------------------------------------------------------------------===// +// Instruction definitions +//===----------------------------------------------------------------------===// +// Every intrinsic subclasses "Inst". An intrinsic has a name, a prototype and +// a sequence of typespecs. +// +// The name is the base name of the intrinsic, for example "svld1". This is +// then mangled by the tblgen backend to add type information ("svld1_s16"). +// +// A typespec is a sequence of uppercase characters (modifiers) followed by one +// lowercase character. A typespec encodes a particular "base type" of the +// intrinsic. +// +// An example typespec is "Us" - unsigned short - svuint16_t. The available +// typespec codes are given below. +// +// The string given to an Inst class is a sequence of typespecs. The intrinsic +// is instantiated for every typespec in the sequence. For example "sdUsUd". +// +// The prototype is a string that defines the return type of the intrinsic +// and the type of each argument. The return type and every argument gets a +// "modifier" that can change in some way the "base type" of the intrinsic. +// +// The modifier 'd' means "default" and does not modify the base type in any +// way. The available modifiers are given below. 
+//
+// Typespecs
+// ---------
+// c: char
+// s: short
+// i: int
+// l: long
+// f: float
+// h: half-float
+// d: double
+// b: bfloat
+
+// Typespec modifiers
+// ------------------
+// P: boolean
+// Q: svcount
+// U: unsigned
+
+// Prototype modifiers
+// -------------------
+// prototype: return (arg, arg, ...)
+//
+// 2,3,4: array of default vectors
+// v: void
+// x: vector of signed integers
+// u: vector of unsigned integers
+// d: default
+// c: const pointer type
+// P: predicate type
+// s: scalar of element type
+// a: scalar of element type (splat to vector type)
+// R: scalar of 1/2 width element type (splat to vector type)
+// r: scalar of 1/4 width element type (splat to vector type)
+// @: unsigned scalar of 1/4 width element type (splat to vector type)
+// e: 1/2 width unsigned elements, 2x element count
+// b: 1/4 width unsigned elements, 4x element count
+// h: 1/2 width elements, 2x element count
+// q: 1/4 width elements, 4x element count
+// o: 4x width elements, 1/4 element count
+//
+// w: vector of element type promoted to 64bits, vector maintains
+//    signedness of its element type.
+// f: element type promoted to uint64_t (splat to vector type)
+// j: element type promoted to 64bits (splat to vector type)
+// K: element type bitcast to a signed integer (splat to vector type)
+// L: element type bitcast to an unsigned integer (splat to vector type)
+//
+// i: constant uint64_t
+// k: int32_t
+// l: int64_t
+// m: uint32_t
+// n: uint64_t
+// y: bool
+
+// t: svint32_t
+// z: svuint32_t
+// g: svuint64_t
+// O: svfloat16_t
+// M: svfloat32_t
+// N: svfloat64_t
+
+// J: Prefetch type (sv_prfop)
+// p: pointer to element type
+// A: pointer to int8_t
+// B: pointer to int16_t
+// C: pointer to int32_t
+// D: pointer to int64_t
+
+// E: pointer to uint8_t
+// F: pointer to uint16_t
+// G: pointer to uint32_t
+// H: pointer to uint64_t
+
+// Q: const pointer to void
+
+// S: const pointer to int8_t
+// T: const pointer to int16_t
+// U: const pointer to int32_t
+// V: const pointer to int64_t
+//
+// W: const pointer to uint8_t
+// X: const pointer to uint16_t
+// Y: const pointer to uint32_t
+// Z: const pointer to uint64_t
+
+// Prototype modifiers added for SME
+// {: pointer to void
+
+// Prototype modifiers added for SVE2p1
+// }: svcount_t
+
+class MergeType<int val, string suffix = ""> {
+  int Value = val;
+  string Suffix = suffix;
+}
+def MergeNone : MergeType<0>;
+def MergeAny : MergeType<1, "_x">;
+def MergeOp1 : MergeType<2, "_m">;
+def MergeZero : MergeType<3, "_z">;
+def MergeAnyExp : MergeType<4, "_x">; // Use merged builtin with explicit
+def MergeZeroExp : MergeType<5, "_z">; // generation of its inactive argument.
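+
+// For illustration (this example is not itself part of the generated
+// header): given a def such as
+//   SInst<"svpmullt[_{d}]", "dhh", "UsUl", ...>
+// the typespec "Us" makes svuint16_t the default type and the 'h' modifier
+// halves the element width, so tblgen emits the C declaration
+//   svuint16_t svpmullt_u16(svuint8_t op1, svuint8_t op2);
+// and likewise svpmullt_u64 for the "Ul" typespec. The parameter names
+// here are purely illustrative.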
+
+class EltType<int val> {
+  int Value = val;
+}
+def EltTyInvalid : EltType<0>;
+def EltTyInt8 : EltType<1>;
+def EltTyInt16 : EltType<2>;
+def EltTyInt32 : EltType<3>;
+def EltTyInt64 : EltType<4>;
+def EltTyFloat16 : EltType<5>;
+def EltTyFloat32 : EltType<6>;
+def EltTyFloat64 : EltType<7>;
+def EltTyBool8 : EltType<8>;
+def EltTyBool16 : EltType<9>;
+def EltTyBool32 : EltType<10>;
+def EltTyBool64 : EltType<11>;
+def EltTyBFloat16 : EltType<12>;
+
+class MemEltType<int val> {
+  int Value = val;
+}
+def MemEltTyDefault : MemEltType<0>;
+def MemEltTyInt8 : MemEltType<1>;
+def MemEltTyInt16 : MemEltType<2>;
+def MemEltTyInt32 : MemEltType<3>;
+def MemEltTyInt64 : MemEltType<4>;
+def MemEltTyInt128 : MemEltType<5>;
+
+class FlagType<int val> {
+  int Value = val;
+}
+
+// These must be kept in sync with the flags in utils/TableGen/SveEmitter.cpp
+// and include/clang/Basic/TargetBuiltins.h
+def NoFlags : FlagType<0x00000000>;
+def FirstEltType : FlagType<0x00000001>;
+//      :                                     :
+//      :                                     :
+def EltTypeMask : FlagType<0x0000000f>;
+def FirstMemEltType : FlagType<0x00000010>;
+//      :                                     :
+//      :                                     :
+def MemEltTypeMask : FlagType<0x00000070>;
+def FirstMergeTypeMask : FlagType<0x00000080>;
+//      :                                     :
+//      :                                     :
+def MergeTypeMask : FlagType<0x00000380>;
+def FirstSplatOperand : FlagType<0x00000400>;
+//      :                                     :
+// These flags are used to specify which scalar operand
+// needs to be duplicated/splatted into a vector.
+//      :                                     :
+def SplatOperandMask : FlagType<0x00001C00>;
+def IsLoad : FlagType<0x00002000>;
+def IsStore : FlagType<0x00004000>;
+def IsGatherLoad : FlagType<0x00008000>;
+def IsScatterStore : FlagType<0x00010000>;
+def IsStructLoad : FlagType<0x00020000>;
+def IsStructStore : FlagType<0x00040000>;
+def IsZExtReturn : FlagType<0x00080000>; // Return value is zero-extended; sign-extend is the default.
+def IsOverloadNone : FlagType<0x00100000>; // Intrinsic does not take any overloaded types.
+def IsOverloadWhile : FlagType<0x00200000>; // Use {default type, typeof(operand1)} as overloaded types.
+def IsOverloadWhileRW : FlagType<0x00400000>; // Use {pred(default type), typeof(operand0)} as overloaded types.
+def IsOverloadCvt : FlagType<0x00800000>; // Use {typeof(operand0), typeof(last operand)} as overloaded types.
+def OverloadKindMask : FlagType<0x00E00000>; // When the masked values are all '0', the default type is used as overload type.
+def IsByteIndexed : FlagType<0x01000000>;
+def IsAppendSVALL : FlagType<0x02000000>; // Appends SV_ALL as the last operand.
+def IsInsertOp1SVALL : FlagType<0x04000000>; // Inserts SV_ALL as the second operand.
+def IsPrefetch : FlagType<0x08000000>; // Contiguous prefetches.
+def IsGatherPrefetch : FlagType<0x10000000>;
+def ReverseCompare : FlagType<0x20000000>; // Compare operands must be swapped.
+def ReverseUSDOT : FlagType<0x40000000>; // Unsigned/signed operands must be swapped.
+def IsUndef : FlagType<0x80000000>; // Codegen `undef` of given type.
+def IsTupleCreate : FlagType<0x100000000>;
+def IsTupleGet : FlagType<0x200000000>;
+def IsTupleSet : FlagType<0x400000000>;
+def IsStreaming : FlagType<0x800000000>;
+def IsStreamingCompatible : FlagType<0x1000000000>;
+def IsSharedZA : FlagType<0x2000000000>;
+def IsPreservedZA : FlagType<0x4000000000>;
+def IsReadZA : FlagType<0x8000000000>;
+def IsWriteZA : FlagType<0x10000000000>;
+def IsArmInStreamingMode : FlagType<0x20000000000>;
+def IsZASliceBaseOffsetIntr : FlagType<0x40000000000>;
+
+// These must be kept in sync with the flags in include/clang/Basic/TargetBuiltins.h
+class ImmCheckType<int val> {
+  int Value = val;
+}
+def ImmCheck0_31 : ImmCheckType<0>; // 0..31 (used for e.g. predicate patterns)
+def ImmCheck1_16 : ImmCheckType<1>; // 1..16
+def ImmCheckExtract : ImmCheckType<2>; // 0..(2048/sizeinbits(elt) - 1)
+def ImmCheckShiftRight : ImmCheckType<3>; // 1..sizeinbits(elt)
+def ImmCheckShiftRightNarrow : ImmCheckType<4>; // 1..sizeinbits(elt)/2
+def ImmCheckShiftLeft : ImmCheckType<5>; // 0..(sizeinbits(elt) - 1)
+def ImmCheck0_7 : ImmCheckType<6>; // 0..7
+def ImmCheckLaneIndex : ImmCheckType<7>; // 0..(128/(1*sizeinbits(elt)) - 1)
+def ImmCheckLaneIndexCompRotate : ImmCheckType<8>; // 0..(128/(2*sizeinbits(elt)) - 1)
+def ImmCheckLaneIndexDot : ImmCheckType<9>; // 0..(128/(4*sizeinbits(elt)) - 1)
+def ImmCheckComplexRot90_270 : ImmCheckType<10>; // [90, 270]
+def ImmCheckComplexRotAll90 : ImmCheckType<11>; // [0, 90, 180, 270]
+def ImmCheck0_13 : ImmCheckType<12>; // 0..13
+def ImmCheck0_1 : ImmCheckType<13>; // 0..1
+def ImmCheck0_2 : ImmCheckType<14>; // 0..2
+def ImmCheck0_3 : ImmCheckType<15>; // 0..3
+def ImmCheck0 : ImmCheckType<16>; // Must be 0.
+def ImmCheck0_15 : ImmCheckType<17>; // 0..15
+def ImmCheck0_255 : ImmCheckType<18>; // 0..255
+def ImmCheck0_2_Mul2 : ImmCheckType<19>; // 0, 2
+def ImmCheck0_6_Mul2 : ImmCheckType<20>; // 0, 2, .., 6
+def ImmCheck0_14_Mul2 : ImmCheckType<21>; // 0, 2, .., 14
+def ImmCheck0_4_Mul4 : ImmCheckType<22>; // 0, 4
+def ImmCheck0_12_Mul4 : ImmCheckType<23>; // 0, 4, 8, 12
+
+
+class ImmCheck<int arg, ImmCheckType kind, int eltSizeArg = -1> {
+  int Arg = arg;
+  int EltSizeArg = eltSizeArg;
+  ImmCheckType Kind = kind;
+}
diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -6940,6 +6940,8 @@
                                      NestedNameSpecInfo &IdInfo,
                                      bool EnteringContext);
+  bool IsInvalidSMECallConversion(QualType FromType, QualType ToType);
+
  /// The parser has parsed a nested-name-specifier
  /// 'template[opt] template-name < template-args >::'.
  ///
@@ -13261,7 +13263,10 @@
  bool CheckNeonBuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID,
                                    CallExpr *TheCall);
  bool CheckMVEBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall);
+  bool ParseSVEImmChecks(CallExpr *TheCall,
+                         SmallVector<std::tuple<int, int, int>, 3> &ImmChecks);
  bool CheckSVEBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall);
+  bool CheckSMEBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall);
  bool CheckCDEBuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID,
                                   CallExpr *TheCall);
  bool CheckARMCoprocessorImmediate(const TargetInfo &TI, const Expr *CoprocArg,
diff --git a/clang/include/clang/module.modulemap b/clang/include/clang/module.modulemap
--- a/clang/include/clang/module.modulemap
+++ b/clang/include/clang/module.modulemap
@@ -49,6 +49,7 @@
  textual header "Basic/BuiltinsRISCV.def"
  textual header "Basic/BuiltinsRISCVVector.def"
  textual header "Basic/BuiltinsSVE.def"
+  textual header "Basic/BuiltinsSME.def"
  textual header "Basic/BuiltinsSystemZ.def"
  textual header "Basic/BuiltinsVE.def"
  textual header "Basic/BuiltinsVEVL.gen.def"
diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp
--- a/clang/lib/AST/ASTContext.cpp
+++ b/clang/lib/AST/ASTContext.cpp
@@ -2265,6 +2265,11 @@
    Width = 0; \
    Align = 16; \
    break;
+#define SVE_OPAQUE_TYPE(Name, MangledName, Id, SingletonId) \
+  case BuiltinType::Id: \
+    Width = 0; \
+    Align = 16; \
+    break;
#include "clang/Basic/AArch64SVEACLETypes.def"
#define PPC_VECTOR_TYPE(Name, Id, Size) \
  case BuiltinType::Id: \
@@ -3963,6 +3968,10 @@
    return SVE_INT_ELTTY(64, 2, false, 4);
  case BuiltinType::SveBool:
    return SVE_ELTTY(BoolTy, 16, 1);
+  case BuiltinType::SveBoolx2:
+    return SVE_ELTTY(BoolTy, 16, 2);
+  case BuiltinType::SveBoolx4:
+    return SVE_ELTTY(BoolTy, 16, 4);
  case BuiltinType::SveFloat16:
    return SVE_ELTTY(HalfTy, 8, 1);
  case BuiltinType::SveFloat16x2:
@@ -4033,6 +4042,7 @@
#define SVE_PREDICATE_TYPE(Name, MangledName, Id, SingletonId, NumEls) \
    if (EltTy->isBooleanType() && NumElts == NumEls) \
      return SingletonId;
+#define SVE_OPAQUE_TYPE(Name, MangledName, Id, SingletonId)
#include "clang/Basic/AArch64SVEACLETypes.def"
  } else if (Target->hasRISCVVTypes()) {
    uint64_t EltTySize = getTypeSize(EltTy);
@@ -9438,9 +9448,10 @@
/// getSVETypeSize - Return SVE vector or predicate register size.
static uint64_t getSVETypeSize(ASTContext &Context, const BuiltinType *Ty) {
  assert(Ty->isVLSTBuiltinType() && "Invalid SVE Type");
-  return Ty->getKind() == BuiltinType::SveBool
-             ? (Context.getLangOpts().VScaleMin * 128) / Context.getCharWidth()
-             : Context.getLangOpts().VScaleMin * 128;
+  if (Ty->getKind() == BuiltinType::SveBool ||
+      Ty->getKind() == BuiltinType::SveCount)
+    return (Context.getLangOpts().VScaleMin * 128) / Context.getCharWidth();
+  return Context.getLangOpts().VScaleMin * 128;
}

bool ASTContext::areCompatibleSveTypes(QualType FirstType,
@@ -11295,6 +11306,10 @@
    Type = Context.getScalableVectorType(ElementType, NumElements);
    break;
  }
+  case 'Q': {
+    Type = Context.SveCountTy;
+    break;
+  }
  case 'V': {
    char *End;
    unsigned NumElements = strtoul(Str, &End, 10);
diff --git a/clang/lib/AST/ItaniumMangle.cpp b/clang/lib/AST/ItaniumMangle.cpp
--- a/clang/lib/AST/ItaniumMangle.cpp
+++ b/clang/lib/AST/ItaniumMangle.cpp
@@ -3129,6 +3129,12 @@
    Out << (type_name == InternalName ? "u" : "") << type_name.size() \
        << type_name; \
    break;
+#define SVE_OPAQUE_TYPE(InternalName, MangledName, Id, SingletonId) \
+  case BuiltinType::Id: \
+    type_name = MangledName; \
+    Out << (type_name == InternalName ? "u" : "") << type_name.size() \
"u" : "") << type_name.size() \ + << type_name; \ + break; #include "clang/Basic/AArch64SVEACLETypes.def" #define PPC_VECTOR_TYPE(Name, Id, Size) \ case BuiltinType::Id: \ diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp --- a/clang/lib/AST/Type.cpp +++ b/clang/lib/AST/Type.cpp @@ -2337,6 +2337,8 @@ case BuiltinType::SveFloat64: case BuiltinType::SveBFloat16: case BuiltinType::SveBool: + case BuiltinType::SveBoolx2: + case BuiltinType::SveBoolx4: return true; default: return false; @@ -3198,6 +3200,13 @@ argSlot[i] = params[i]; } + // Propagate the SME ACLE attributes. + if (epi.AArch64SMEAttributes != SME_NormalFunction) { + auto &ExtraBits = *getTrailingObjects(); + ExtraBits.AArch64SMEAttributes = epi.AArch64SMEAttributes; + } + + // Fill in the exception type array if present. if (getExceptionSpecType() == EST_Dynamic) { auto &ExtraBits = *getTrailingObjects(); @@ -3391,6 +3400,8 @@ for (unsigned i = 0; i != NumParams; ++i) ID.AddInteger(epi.ExtParameterInfos[i].getOpaqueValue()); } + ID.AddInteger(epi.AArch64SMEAttributes); + epi.ExtInfo.Profile(ID); ID.AddBoolean(epi.HasTrailingReturn); } diff --git a/clang/lib/AST/TypePrinter.cpp b/clang/lib/AST/TypePrinter.cpp --- a/clang/lib/AST/TypePrinter.cpp +++ b/clang/lib/AST/TypePrinter.cpp @@ -915,6 +915,24 @@ FunctionType::ExtInfo Info = T->getExtInfo(); + if ((T->getAArch64SMEAttributes() & + FunctionType::SME_PStateSMCompatibleMask) && + !InsideCCAttribute) + OS << " __attribute__((arm_streaming_compatible))"; + if ((T->getAArch64SMEAttributes() & FunctionType::SME_PStateSMEnabledMask) && + !InsideCCAttribute) + OS << " __attribute__((arm_streaming))"; + if ((T->getAArch64SMEAttributes() & FunctionType::SME_PStateZANewMask) && + !InsideCCAttribute) + OS << " __attribute__((arm_new_za))"; + if ((T->getAArch64SMEAttributes() & FunctionType::SME_PStateZASharedMask) && + !InsideCCAttribute) + OS << " __attribute__((arm_shared_za))"; + if ((T->getAArch64SMEAttributes() & + FunctionType::SME_PStateZAPreservedMask) && + !InsideCCAttribute) + OS << " __attribute__((arm_preserves_za))"; + printFunctionAfter(Info, OS); if (!T->getMethodQuals().empty()) @@ -1792,6 +1810,11 @@ break; } case attr::AArch64VectorPcs: OS << "aarch64_vector_pcs"; break; + case attr::ArmStreaming: OS << "arm_streaming"; break; + case attr::ArmStreamingCompatible: OS << "arm_streaming_compatible"; break; + case attr::ArmNewZA: OS << "arm_new_za"; break; + case attr::ArmPreservesZA: OS << "arm_preserves_za"; break; + case attr::ArmSharedZA: OS << "arm_shared_za"; break; case attr::AArch64SVEPcs: OS << "aarch64_sve_pcs"; break; case attr::AMDGPUKernelCall: OS << "amdgpu_kernel"; break; case attr::IntelOclBicc: OS << "inteloclbicc"; break; diff --git a/clang/lib/Basic/Targets/AArch64.h b/clang/lib/Basic/Targets/AArch64.h --- a/clang/lib/Basic/Targets/AArch64.h +++ b/clang/lib/Basic/Targets/AArch64.h @@ -54,6 +54,8 @@ bool HasLSE; bool HasFlagM; bool HasMOPS; + bool HasSME; + bool HasSME2; bool HasRCPC; llvm::AArch64::ArchKind ArchKind; diff --git a/clang/lib/Basic/Targets/AArch64.cpp b/clang/lib/Basic/Targets/AArch64.cpp --- a/clang/lib/Basic/Targets/AArch64.cpp +++ b/clang/lib/Basic/Targets/AArch64.cpp @@ -33,6 +33,10 @@ {#ID, TYPE, ATTRS, nullptr, ALL_LANGUAGES, nullptr}, #include "clang/Basic/BuiltinsSVE.def" +#define BUILTIN(ID, TYPE, ATTRS) \ + {#ID, TYPE, ATTRS, nullptr, ALL_LANGUAGES, nullptr}, +#include "clang/Basic/BuiltinsSME.def" + #define BUILTIN(ID, TYPE, ATTRS) \ {#ID, TYPE, ATTRS, nullptr, ALL_LANGUAGES, nullptr}, #define LANGBUILTIN(ID, TYPE, 
@@ -400,9 +404,8 @@
    Builder.defineMacro("__ARM_FEATURE_BF16_SCALAR_ARITHMETIC", "1");
  }

-  if ((FPU & SveMode) && HasBFloat16) {
+  if ((FPU & SveMode) && HasBFloat16)
    Builder.defineMacro("__ARM_FEATURE_SVE_BF16", "1");
-  }

  if ((FPU & SveMode) && HasMatmulFP64)
    Builder.defineMacro("__ARM_FEATURE_SVE_MATMUL_FP64", "1");
@@ -528,6 +531,8 @@
      .Cases("aarch64", "arm64", "arm", true)
      .Case("neon", FPU & NeonMode)
      .Cases("sve", "sve2", "sve2-bitperm", "sve2-aes", "sve2-sha3", "sve2-sm4", "f64mm", "f32mm", "i8mm", "bf16", FPU & SveMode)
+      .Case("sme", HasSME)
+      .Case("sme2", HasSME2)
      .Case("ls64", HasLS64)
      .Default(false);
}
@@ -575,6 +580,8 @@
  HasMatmulFP32 = false;
  HasLSE = false;
  HasMOPS = false;
+  HasSME = false;
+  HasSME2 = false;
  HasRCPC = false;

  ArchKind = llvm::AArch64::ArchKind::INVALID;
@@ -582,6 +589,19 @@
  for (const auto &Feature : Features) {
    if (Feature == "+neon")
      FPU |= NeonMode;
+    if (Feature == "+sme") {
+      FPU |= SveMode;
+      HasSME = true;
+      HasBFloat16 = true;
+      HasFullFP16 = true;
+    }
+    if (Feature == "+sme2") {
+      FPU |= SveMode;
+      HasSME = true;
+      HasSME2 = true;
+      HasBFloat16 = true;
+      HasFullFP16 = true;
+    }
    if (Feature == "+sve") {
      FPU |= SveMode;
      HasFullFP16 = true;
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -6598,11 +6598,32 @@
#undef SVEMAP1
#undef SVEMAP2

+#define SMEMAP1(NameBase, LLVMIntrinsic, TypeModifier) \
+  { \
+    #NameBase, SME::BI__builtin_sme_##NameBase, Intrinsic::LLVMIntrinsic, 0, \
+        TypeModifier \
+  }
+
+#define SMEMAP2(NameBase, TypeModifier) \
+  { \
+    #NameBase, SME::BI__builtin_sme_##NameBase, 0, 0, TypeModifier \
+  }
+
+static const ARMVectorIntrinsicInfo AArch64SMEIntrinsicMap[] = {
+#define GET_SME_LLVM_INTRINSIC_MAP
+#include "clang/Basic/arm_sme_builtin_cg.inc"
+#undef GET_SME_LLVM_INTRINSIC_MAP
+};
+
+#undef SMEMAP1
+#undef SMEMAP2
+
static bool NEONSIMDIntrinsicsProvenSorted = false;

static bool AArch64SIMDIntrinsicsProvenSorted = false;
static bool AArch64SISDIntrinsicsProvenSorted = false;
static bool AArch64SVEIntrinsicsProvenSorted = false;
+static bool AArch64SMEIntrinsicsProvenSorted = false;

static const ARMVectorIntrinsicInfo *
findARMVectorIntrinsicInMap(ArrayRef<ARMVectorIntrinsicInfo> IntrinsicMap,
@@ -8737,6 +8758,8 @@
    return Builder.getInt32Ty();
  case SVETypeFlags::MemEltTyInt64:
    return Builder.getInt64Ty();
+  case SVETypeFlags::MemEltTyInt128:
+    return Builder.getInt128Ty();
  }
  llvm_unreachable("Unknown MemEltType");
}
@@ -8863,6 +8886,9 @@
// the elements of the specified datatype.
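// (An SME2 svcount_t value is an opaque target type rather than a predicate
// vector, so it cannot be bitcast here and is returned unchanged below.)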
Value *CodeGenFunction::EmitSVEPredicateCast(Value *Pred,
                                             llvm::ScalableVectorType *VTy) {
+  if (Pred->getType()->isAArch64SvcountTy())
+    return Pred;
+
  auto *RTy = llvm::VectorType::get(IntegerType::get(getLLVMContext(), 1), VTy);
  if (Pred->getType() == RTy)
    return Pred;
@@ -8872,6 +8898,7 @@
  switch (VTy->getMinNumElements()) {
  default:
    llvm_unreachable("unsupported element count!");
+  case 1:
  case 2:
  case 4:
  case 8:
@@ -9040,12 +9067,16 @@
  unsigned N;
  switch (IntID) {
  case Intrinsic::aarch64_sve_ld2_sret:
+  case Intrinsic::aarch64_sve_ld1_pn_vg2:
+  case Intrinsic::aarch64_sve_ldnt1_pn_vg2:
    N = 2;
    break;
  case Intrinsic::aarch64_sve_ld3_sret:
    N = 3;
    break;
  case Intrinsic::aarch64_sve_ld4_sret:
+  case Intrinsic::aarch64_sve_ld1_pn_vg4:
+  case Intrinsic::aarch64_sve_ldnt1_pn_vg4:
    N = 4;
    break;
  default:
@@ -9054,7 +9085,7 @@
  auto RetTy = llvm::VectorType::get(VTy->getElementType(),
                                     VTy->getElementCount() * N);
-  Value *Predicate = EmitSVEPredicateCast(Ops[0], VTy);
+  Value *Predicate = EmitSVEPredicateCast(Ops[0], VTy);
  Value *BasePtr = Builder.CreateBitCast(Ops[1], VecPtrTy);
  Value *Offset = Ops.size() > 2 ? Ops[2] : Builder.getInt32(0);
  BasePtr = Builder.CreateGEP(VTy, BasePtr, Offset);
@@ -9081,12 +9112,16 @@
  unsigned N;
  switch (IntID) {
  case Intrinsic::aarch64_sve_st2:
+  case Intrinsic::aarch64_sve_st1_pn_vg2:
+  case Intrinsic::aarch64_sve_stnt1_pn_vg2:
    N = 2;
    break;
  case Intrinsic::aarch64_sve_st3:
    N = 3;
    break;
  case Intrinsic::aarch64_sve_st4:
+  case Intrinsic::aarch64_sve_st1_pn_vg4:
+  case Intrinsic::aarch64_sve_stnt1_pn_vg4:
    N = 4;
    break;
  default:
@@ -9114,6 +9149,7 @@
  return Builder.CreateCall(F, Operands);
}

+
// SVE2's svpmullb and svpmullt builtins are similar to the svpmullb_pair and
// svpmullt_pair intrinsics, with the exception that their results are bitcast
// to a wider type.
@@ -9280,6 +9316,193 @@
  return {DefaultType};
}

+static llvm::Value *getSMESliceIndex(CGBuilderTy &Builder, llvm::Value *Base,
+                                     llvm::Value *Offset) {
+  llvm::APInt OffsetVal = cast<ConstantInt>(Offset)->getValue().trunc(32);
+  Offset = ConstantInt::get(Base->getType(), OffsetVal);
+  return Builder.CreateAdd(Base, Offset);
+}
+
+Value *CodeGenFunction::EmitSMELoadStore(const SVETypeFlags &TypeFlags,
+                                         unsigned BuiltinID,
+                                         SmallVectorImpl<Value *> &Ops) {
+  // We support 3 classes of builtins here:
+  // 1. 4-op forms:
+  //      svldr_vnum_za(slice_base, slice_offset, pn, ptr)
+  //      svstr_vnum_za(slice_base, slice_offset, pn, ptr)
+  // 2. 5-op forms:
+  //      svld1_(ver|hor)_zaX(tile, slice_base, slice_offset, pn, ptr)
+  //      svst1_(ver|hor)_zaX(tile, slice_base, slice_offset, pn, ptr)
+  // 3. 6-op forms:
+  //      svld1_(ver|hor)_vnum_zaX(tile, slice_base, slice_offset, pn, ptr, vnum)
+  //      svst1_(ver|hor)_vnum_zaX(tile, slice_base, slice_offset, pn, ptr, vnum)
+  int NumOps = Ops.size();
+  llvm::Value *Ptr = NumOps == 4 ? Ops[3] : Ops[4];
+
+  if (NumOps != 5) {
+    // For the 4-op form, slice_offset == vnum.
+    llvm::Value *VNum = NumOps == 4 ? Ops[1] : Ops[5];
+
+    Function *F = CGM.getIntrinsic(Intrinsic::aarch64_sme_cntsb);
+    llvm::Value *SVL = Builder.CreateCall(F);
+    Value *Offset =
+        Builder.CreateMul(Builder.CreateZExtOrTrunc(VNum, SVL->getType()), SVL);
+
+    llvm::Type *ByteType = Builder.getInt8Ty();
+    Ptr = Builder.CreateGEP(ByteType, Ptr, Offset);
+  }
+
+  Function *F = CGM.getIntrinsic(BuiltinID);
+  if (NumOps == 4) {
+    // Pn is not used at all in the intrinsic because the instruction doesn't
+    // use it either. This will need to be updated in the specification.
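+    // (For illustration: assuming the intrinsic here is
+    //  @llvm.aarch64.sme.ldr / @llvm.aarch64.sme.str, the call below
+    //  effectively becomes F(slice_base + vnum, ptr + vnum * svl_b),
+    //  since for this form slice_offset and vnum are the same value.)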
+    llvm::Value *Slice = getSMESliceIndex(Builder, Ops[0], Ops[1]);
+    return Builder.CreateCall(F, {Slice, Ptr});
+  }
+  llvm::Value *Slice = getSMESliceIndex(Builder, Ops[1], Ops[2]);
+  auto *OverloadedTy =
+      getSVEVectorForElementType(SVEBuiltinMemEltTy(TypeFlags));
+  llvm::Value *Pn = EmitSVEPredicateCast(Ops[3], OverloadedTy);
+  return Builder.CreateCall(F, {Pn, Ptr, /*Tile*/ Ops[0], Slice});
+}
+
+Value *CodeGenFunction::EmitAArch64SMEBuiltinExpr(unsigned BuiltinID,
+                                                  const CallExpr *E) {
+  // Find out if any arguments are required to be integer constant expressions.
+  unsigned ICEArguments = 0;
+  ASTContext::GetBuiltinTypeError Error;
+  getContext().GetBuiltinType(BuiltinID, Error, &ICEArguments);
+  assert(Error == ASTContext::GE_None && "Should not codegen an error");
+
+  llvm::SmallVector<Value *, 4> Ops;
+  for (unsigned i = 0, e = E->getNumArgs(); i != e; i++) {
+    if ((ICEArguments & (1 << i)) == 0) {
+      Value *Arg = EmitScalarExpr(E->getArg(i));
+      if (auto *VTy = dyn_cast<ScalableVectorType>(Arg->getType())) {
+        unsigned MinElts = VTy->getMinNumElements();
+        bool IsPred = VTy->getElementType()->isIntegerTy(1);
+        unsigned N =
+            (MinElts * VTy->getScalarSizeInBits()) / (IsPred ? 16 : 128);
+        for (unsigned I = 0; I < N; ++I) {
+          Value *Idx = ConstantInt::get(CGM.Int64Ty, (I * MinElts) / N);
+          auto *NewVTy =
+              ScalableVectorType::get(VTy->getElementType(), MinElts / N);
+          Ops.push_back(Builder.CreateExtractVector(NewVTy, Arg, Idx));
+        }
+      } else
+        Ops.push_back(Arg);
+    } else {
+      // If this is required to be a constant, constant fold it so that we know
+      // that the generated intrinsic gets a ConstantInt.
+      Optional<llvm::APSInt> Result =
+          E->getArg(i)->getIntegerConstantExpr(getContext());
+      assert(Result && "Expected argument to be a constant");
+
+      // Immediates for SME llvm intrinsics are always 32bit. We can safely
+      // truncate because the immediate has been range checked and no valid
+      // immediate requires more than a handful of bits.
+      *Result = Result->extOrTrunc(32);
+      Ops.push_back(llvm::ConstantInt::get(getLLVMContext(), *Result));
+    }
+  }
+
+  auto *Builtin = findARMVectorIntrinsicInMap(AArch64SMEIntrinsicMap, BuiltinID,
+                                              AArch64SMEIntrinsicsProvenSorted);
+  SVETypeFlags TypeFlags(Builtin->TypeModifier);
+  if (TypeFlags.isLoad() || TypeFlags.isStore())
+    return EmitSMELoadStore(TypeFlags, Builtin->LLVMIntrinsic, Ops);
+
+  if (TypeFlags.isArmInStreamingMode()) {
+    // Create call to __arm_sme_state and extract lower bit from X0.
+    CallInst *CI = EmitRuntimeCall(CGM.CreateRuntimeFunction(
+        llvm::FunctionType::get(StructType::get(CGM.Int64Ty, CGM.Int64Ty), {},
+                                false),
+        "__arm_sme_state"));
+    auto Attrs =
+        AttributeList()
+            .addFnAttribute(getLLVMContext(), "aarch64_pstate_sm_compatible")
+            .addFnAttribute(getLLVMContext(), "aarch64_pstate_za_preserved");
+    CI->setAttributes(Attrs);
+    CI->setCallingConv(
+        llvm::CallingConv::
+            AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2);
+    Value *X0 = Builder.CreateExtractValue(CI, 0);
+    Value *X0Bit0 = Builder.CreateAnd(X0, Builder.getInt64(1));
+    return Builder.CreateZExtOrTrunc(X0Bit0, ConvertType(E->getType()));
+  }
+
+  // Should not happen!
+  if (Builtin->LLVMIntrinsic == 0)
+    return nullptr;
+
+  if (TypeFlags.isZASliceBaseOffsetIntr()) {
+    llvm::Value *SliceBase = Ops[0];
+    llvm::Value *SliceOff = Ops[1];
+    llvm::Value *Slice = getSMESliceIndex(Builder, SliceBase, SliceOff);
+    Ops[0] = Slice;
+    Ops.erase(Ops.begin() + 1);
+  }
+
+  // Predicates must match the main datatype.
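+  // (i.e. each svbool_t argument is reinterpreted to the predicate type
+  //  implied by the intrinsic's default type, exactly as EmitSVEPredicateCast
+  //  does for the SVE builtins.)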
+  for (unsigned i = 0, e = Ops.size(); i != e; ++i)
+    if (auto PredTy = dyn_cast<llvm::ScalableVectorType>(Ops[i]->getType()))
+      if (PredTy->getElementType()->isIntegerTy(1)) {
+        PredTy = getSVEType(TypeFlags);
+        Ops[i] = EmitSVEPredicateCast(Ops[i], PredTy);
+      }
+
+  if (TypeFlags.isReadZA() || TypeFlags.isWriteZA()) {
+    llvm::ScalableVectorType *FnType = getSVEType(TypeFlags);
+    Function *F = CGM.getIntrinsic(Builtin->LLVMIntrinsic, {FnType});
+
+    // FIXME! We probably shouldn't be adding the slice_base (Op[1]) to the
+    // slice_offset (Op[2]) here. Instead I think it makes sense to change the
+    // LLVM intrinsic to match the ACLE intrinsic.
+    llvm::Value *SliceBase = TypeFlags.isReadZA() ? Ops[3] : Ops[1];
+    llvm::Value *SliceOff = TypeFlags.isReadZA() ? Ops[4] : Ops[2];
+    llvm::Value *Slice = getSMESliceIndex(Builder, SliceBase, SliceOff);
+
+    if (TypeFlags.isReadZA())
+      return Builder.CreateCall(
+          F, {/*Passthru*/ Ops[0], /*Pg*/ Ops[1], /*Tile*/ Ops[2], Slice});
+    else
+      return Builder.CreateCall(
+          F, {/*Tile*/ Ops[0], Slice, /*Pg*/ Ops[3], /*Write data*/ Ops[4]});
+  }
+
+  Function *F =
+      TypeFlags.isOverloadNone()
+          ? CGM.getIntrinsic(Builtin->LLVMIntrinsic)
+          : CGM.getIntrinsic(Builtin->LLVMIntrinsic, {getSVEType(TypeFlags)});
+  Value *Call = Builder.CreateCall(F, Ops);
+
+  // Predicate results must be converted to svbool_t.
+  if (auto PredTy = dyn_cast<llvm::ScalableVectorType>(Call->getType()))
+    if (PredTy->getScalarType()->isIntegerTy(1))
+      Call = EmitSVEPredicateCast(Call, ScalableVectorType::get(Builder.getInt1Ty(), 16));
+
+  // Multi-vector results should be broken up into a single (wide) result vector.
+  if (auto *StructTy = dyn_cast<StructType>(Call->getType())) {
+    if (auto *VTy = dyn_cast<ScalableVectorType>(StructTy->getTypeAtIndex(0U))) {
+      unsigned N = StructTy->getNumElements();
+      unsigned MinElts = VTy->getMinNumElements();
+      ScalableVectorType *WideVTy =
+          ScalableVectorType::get(VTy->getElementType(), MinElts * N);
+
+      Value *Ret = llvm::PoisonValue::get(WideVTy);
+      for (unsigned I = 0; I < N; ++I) {
+        Value *SRet = Builder.CreateExtractValue(Call, I);
+        assert(SRet->getType() == VTy && "Unexpected type for result value");
+        Value *Idx = ConstantInt::get(CGM.Int64Ty, I * MinElts);
+        Ret = Builder.CreateInsertVector(WideVTy, Ret, SRet, Idx);
+      }
+      Call = Ret;
+    }
+  }
+
+  return Call;
+}
+
Value *CodeGenFunction::EmitSVETupleSetOrGet(const SVETypeFlags &TypeFlags,
                                             llvm::Type *Ty,
                                             ArrayRef<Value *> Ops) {
@@ -9654,6 +9877,10 @@
      BuiltinID <= clang::AArch64::LastSVEBuiltin)
    return EmitAArch64SVEBuiltinExpr(BuiltinID, E);

+  if (BuiltinID >= AArch64::FirstSMEBuiltin &&
+      BuiltinID <= AArch64::LastSMEBuiltin)
+    return EmitAArch64SMEBuiltinExpr(BuiltinID, E);
+
  unsigned HintID = static_cast<unsigned>(-1);
  switch (BuiltinID) {
  default: break;
diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp
--- a/clang/lib/CodeGen/CGCall.cpp
+++ b/clang/lib/CodeGen/CGCall.cpp
@@ -1765,6 +1765,17 @@
  if (!isUnresolvedExceptionSpec(FPT->getExceptionSpecType()) &&
      FPT->isNothrow())
    FuncAttrs.addAttribute(llvm::Attribute::NoUnwind);
+
+  if (FPT->getAArch64SMEAttributes() & FunctionType::SME_PStateSMEnabledMask)
+    FuncAttrs.addAttribute("aarch64_pstate_sm_enabled");
+  if (FPT->getAArch64SMEAttributes() & FunctionType::SME_PStateSMCompatibleMask)
+    FuncAttrs.addAttribute("aarch64_pstate_sm_compatible");
+  if (FPT->getAArch64SMEAttributes() & FunctionType::SME_PStateZANewMask)
+    FuncAttrs.addAttribute("aarch64_pstate_za_new");
+  if (FPT->getAArch64SMEAttributes() & FunctionType::SME_PStateZASharedMask)
+    FuncAttrs.addAttribute("aarch64_pstate_za_shared");
+  if (FPT->getAArch64SMEAttributes() &
+      FunctionType::SME_PStateZAPreservedMask)
+    FuncAttrs.addAttribute("aarch64_pstate_za_preserved");
}

static void AddAttributesFromAssumes(llvm::AttrBuilder &FuncAttrs,
@@ -2216,6 +2227,24 @@
            llvm::toStringRef(CodeGenOpts.UniformWGSize));
      }
    }
+
+    if (TargetDecl->hasAttr<ArmStreamingAttr>())
+      FuncAttrs.addAttribute("aarch64_pstate_sm_enabled");
+
+    if (TargetDecl->hasAttr<ArmLocallyStreamingAttr>())
+      FuncAttrs.addAttribute("aarch64_pstate_sm_body");
+
+    if (TargetDecl->hasAttr<ArmStreamingCompatibleAttr>())
+      FuncAttrs.addAttribute("aarch64_pstate_sm_compatible");
+
+    if (TargetDecl->hasAttr<ArmSharedZAAttr>())
+      FuncAttrs.addAttribute("aarch64_pstate_za_shared");
+
+    if (TargetDecl->hasAttr<ArmPreservesZAAttr>())
+      FuncAttrs.addAttribute("aarch64_pstate_za_preserved");
+
+    if (TargetDecl->hasAttr<ArmNewZAAttr>())
+      FuncAttrs.addAttribute("aarch64_pstate_za_new");
  }

  // Attach "no-builtins" attributes to:
diff --git a/clang/lib/CodeGen/CGDebugInfo.cpp b/clang/lib/CodeGen/CGDebugInfo.cpp
--- a/clang/lib/CodeGen/CGDebugInfo.cpp
+++ b/clang/lib/CodeGen/CGDebugInfo.cpp
@@ -722,7 +722,12 @@
#include "clang/Basic/AArch64SVEACLETypes.def"
      {
        ASTContext::BuiltinVectorTypeInfo Info =
-            CGM.getContext().getBuiltinVectorTypeInfo(BT);
+            // The debug info for svcount_t is the same as that for svbool_t
+            BT->getKind() == BuiltinType::SveCount
+                ? ASTContext::BuiltinVectorTypeInfo(
+                      CGM.getContext().BoolTy,
+                      llvm::ElementCount::getScalable(16), 1)
+                : CGM.getContext().getBuiltinVectorTypeInfo(BT);
        unsigned NumElemsPerVG = (Info.EC.getKnownMinValue() * Info.NumVectors) / 2;

        // Debuggers can't extract 1bit from a vector, so will display a
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -4243,6 +4243,10 @@
                              SmallVectorImpl<llvm::Value *> &Ops,
                              unsigned IntID);
  llvm::Value *EmitAArch64SVEBuiltinExpr(unsigned BuiltinID, const CallExpr *E);
+  llvm::Value *EmitAArch64SMEBuiltinExpr(unsigned BuiltinID, const CallExpr *E);
+  llvm::Value *EmitSMELoadStore(const SVETypeFlags &TypeFlags,
+                                unsigned BuiltinID,
+                                SmallVectorImpl<llvm::Value *> &Ops);

  llvm::Value *EmitAArch64BuiltinExpr(unsigned BuiltinID, const CallExpr *E,
                                      llvm::Triple::ArchType Arch);
diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -1989,6 +1989,12 @@
    return;
  }

+  // We only need to handle the 'arm_locally_streaming' attribute as a
+  // special case here (as opposed to e.g. 'arm_streaming'), because it
+  // is not set from the prototype, but rather from the definition.
+  if (D->hasAttr<ArmLocallyStreamingAttr>())
+    B.addAttribute("aarch64_pstate_sm_body");
+
  // Track whether we need to add the optnone LLVM attribute,
  // starting with the default for this optimization level.
  bool ShouldAddOptNone =
      !CodeGenOpts.DisableO0ImplyOptNone && CodeGenOpts.OptimizationLevel == 0;
diff --git a/clang/lib/CodeGen/CodeGenTypes.cpp b/clang/lib/CodeGen/CodeGenTypes.cpp
--- a/clang/lib/CodeGen/CodeGenTypes.cpp
+++ b/clang/lib/CodeGen/CodeGenTypes.cpp
@@ -596,6 +596,8 @@
    case BuiltinType::SveInt64x4:
    case BuiltinType::SveUint64x4:
    case BuiltinType::SveBool:
+    case BuiltinType::SveBoolx2:
+    case BuiltinType::SveBoolx4:
    case BuiltinType::SveFloat16:
    case BuiltinType::SveFloat16x2:
    case BuiltinType::SveFloat16x3:
@@ -618,6 +620,8 @@
          Info.EC.getKnownMinValue() * Info.NumVectors);
    }
+    case BuiltinType::SveCount:
+      return llvm::Type::getAArch64SvcountTy(getLLVMContext());
#define PPC_VECTOR_TYPE(Name, Id, Size) \
    case BuiltinType::Id: \
      ResultType = \
diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -315,6 +315,8 @@
  clang_generate_header(-gen-arm-fp16 arm_fp16.td arm_fp16.h)
  # Generate arm_sve.h
  clang_generate_header(-gen-arm-sve-header arm_sve.td arm_sve.h)
+  # Generate arm_sme.h
+  clang_generate_header(-gen-arm-sme-header arm_sme.td arm_sme.h)
  # Generate arm_bf16.h
  clang_generate_header(-gen-arm-bf16 arm_bf16.td arm_bf16.h)
  # Generate arm_mve.h
@@ -339,6 +341,7 @@
  list(APPEND aarch64_only_generated_files
    "${CMAKE_CURRENT_BINARY_DIR}/arm_sve.h"
+    "${CMAKE_CURRENT_BINARY_DIR}/arm_sme.h"
    "${CMAKE_CURRENT_BINARY_DIR}/arm_bf16.h"
    "${output_dir}/arm_neon_sve_bridge.h"
  )
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -2783,18 +2783,8 @@
  llvm_unreachable("Invalid NeonTypeFlag!");
}

-bool Sema::CheckSVEBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) {
-  // Range check SVE intrinsics that take immediate values.
-  SmallVector<std::tuple<int, int, int>, 3> ImmChecks;
-
-  switch (BuiltinID) {
-  default:
-    return false;
-#define GET_SVE_IMMEDIATE_CHECK
-#include "clang/Basic/arm_sve_sema_rangechecks.inc"
-#undef GET_SVE_IMMEDIATE_CHECK
-  }
-
+bool Sema::ParseSVEImmChecks(
+    CallExpr *TheCall, SmallVector<std::tuple<int, int, int>, 3> &ImmChecks) {
  // Perform all the immediate checks for this builtin call.
  bool HasError = false;
  for (auto &I : ImmChecks) {
@@ -2898,14 +2888,169 @@
      if (SemaBuiltinConstantArgRange(TheCall, ArgNum, 0, 3))
        HasError = true;
      break;
+    case SVETypeFlags::ImmCheck0:
+      if (SemaBuiltinConstantArgRange(TheCall, ArgNum, 0, 0))
+        HasError = true;
+      break;
+    case SVETypeFlags::ImmCheck0_15:
+      if (SemaBuiltinConstantArgRange(TheCall, ArgNum, 0, 15))
+        HasError = true;
+      break;
+    case SVETypeFlags::ImmCheck0_255:
+      if (SemaBuiltinConstantArgRange(TheCall, ArgNum, 0, 255))
+        HasError = true;
+      break;
+    case SVETypeFlags::ImmCheck0_2_Mul2:
+      if (SemaBuiltinConstantArgRange(TheCall, ArgNum, 0, 2) ||
+          SemaBuiltinConstantArgMultiple(TheCall, ArgNum, 2))
+        HasError = true;
+      break;
+    case SVETypeFlags::ImmCheck0_6_Mul2:
+      if (SemaBuiltinConstantArgRange(TheCall, ArgNum, 0, 6) ||
+          SemaBuiltinConstantArgMultiple(TheCall, ArgNum, 2))
+        HasError = true;
+      break;
+    case SVETypeFlags::ImmCheck0_14_Mul2:
+      if (SemaBuiltinConstantArgRange(TheCall, ArgNum, 0, 14) ||
+          SemaBuiltinConstantArgMultiple(TheCall, ArgNum, 2))
+        HasError = true;
+      break;
+    case SVETypeFlags::ImmCheck0_4_Mul4:
+      if (SemaBuiltinConstantArgRange(TheCall, ArgNum, 0, 4) ||
+          SemaBuiltinConstantArgMultiple(TheCall, ArgNum, 4))
+        HasError = true;
+      break;
+    case SVETypeFlags::ImmCheck0_12_Mul4:
+      if (SemaBuiltinConstantArgRange(TheCall, ArgNum, 0, 12) ||
+          SemaBuiltinConstantArgMultiple(TheCall, ArgNum, 4))
+        HasError = true;
+      break;
    }
  }
-
  return HasError;
}

+enum ArmStreamingType {
+  ArmNonStreaming,
+  ArmStreaming,
+  ArmStreamingCompatible,
+  ArmLocallyStreaming
+};
+
+static ArmStreamingType getArmStreamingFnType(const FunctionDecl *FD) {
+  if (FD->hasAttr<ArmLocallyStreamingAttr>())
+    return ArmLocallyStreaming;
+  if (const auto *T = dyn_cast<FunctionProtoType>(FD->getType())) {
+    if (T->getAArch64SMEAttributes() & FunctionType::SME_PStateSMEnabledMask)
+      return ArmStreaming;
+    if (T->getAArch64SMEAttributes() & FunctionType::SME_PStateSMCompatibleMask)
+      return ArmStreamingCompatible;
+  }
+  return ArmNonStreaming;
+}
+
+static void checkArmStreamingBuiltin(Sema &S, CallExpr *TheCall,
+                                     ArmStreamingType FnType,
+                                     ArmStreamingType BuiltinType) {
+  assert(BuiltinType != ArmLocallyStreaming &&
+         "Unexpected locally_streaming attribute for builtin!");
+  if ((FnType == ArmStreaming || FnType == ArmLocallyStreaming) &&
+      BuiltinType == ArmNonStreaming)
+    S.Diag(TheCall->getBeginLoc(), diag::warn_attribute_arm_sm_incompat_builtin)
+        << TheCall->getSourceRange() << "streaming or locally streaming";
+
+  if ((FnType == ArmStreamingCompatible) &&
+      BuiltinType != ArmStreamingCompatible) {
+    S.Diag(TheCall->getBeginLoc(), diag::warn_attribute_arm_sm_incompat_builtin)
+        << TheCall->getSourceRange() << "streaming compatible";
+    return;
+  }
+
+  if (FnType == ArmNonStreaming && BuiltinType == ArmStreaming)
+    S.Diag(TheCall->getBeginLoc(), diag::warn_attribute_arm_sm_incompat_builtin)
+        << TheCall->getSourceRange() << "non-streaming";
+}
+
+bool Sema::CheckSMEBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) {
+  if (const FunctionDecl *FD = getCurFunctionDecl()) {
+    ArmStreamingType FnType = getArmStreamingFnType(FD);
+    Optional<ArmStreamingType> BuiltinType = None;
+
+    switch (BuiltinID) {
+    default:
+      break;
+#define GET_SME_STREAMING_ATTRS
+#include "clang/Basic/arm_sme_streaming_attrs.inc"
+#undef GET_SME_STREAMING_ATTRS
+    }
+
+    if (BuiltinType)
+      checkArmStreamingBuiltin(*this, TheCall, FnType, *BuiltinType);
+  }
+
+  // Range check SME intrinsics that take immediate values.
+  SmallVector<std::tuple<int, int, int>, 3> ImmChecks;
+
+  switch (BuiltinID) {
+  default:
+    return false;
+#define GET_SME_IMMEDIATE_CHECK
+#include "clang/Basic/arm_sme_sema_rangechecks.inc"
+#undef GET_SME_IMMEDIATE_CHECK
+  }
+
+  return ParseSVEImmChecks(TheCall, ImmChecks);
+}
+
+bool Sema::CheckSVEBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) {
+  if (const FunctionDecl *FD = getCurFunctionDecl()) {
+    ArmStreamingType SmType = getArmStreamingFnType(FD);
+    Optional<ArmStreamingType> BuiltinType = None;
+
+    switch (BuiltinID) {
+    default:
+      break;
+#define GET_SVE_STREAMING_ATTRS
+#include "clang/Basic/arm_sve_streaming_attrs.inc"
+#undef GET_SVE_STREAMING_ATTRS
+    }
+
+    if (BuiltinType)
+      checkArmStreamingBuiltin(*this, TheCall, SmType, *BuiltinType);
+  }
+
+  // Range check SVE intrinsics that take immediate values.
+  SmallVector<std::tuple<int, int, int>, 3> ImmChecks;
+
+  switch (BuiltinID) {
+  default:
+    return false;
+#define GET_SVE_IMMEDIATE_CHECK
+#include "clang/Basic/arm_sve_sema_rangechecks.inc"
+#undef GET_SVE_IMMEDIATE_CHECK
+  }
+
+  return ParseSVEImmChecks(TheCall, ImmChecks);
+}
+
bool Sema::CheckNeonBuiltinFunctionCall(const TargetInfo &TI,
                                        unsigned BuiltinID,
                                        CallExpr *TheCall) {
+  if (const FunctionDecl *FD = getCurFunctionDecl()) {
+    ArmStreamingType SmType = getArmStreamingFnType(FD);
+    Optional<ArmStreamingType> BuiltinType = None;
+
+    switch (BuiltinID) {
+    default:
+      break;
+#define GET_NEON_STREAMING_COMPAT_FLAG
+#include "clang/Basic/arm_neon.inc"
+#undef GET_NEON_STREAMING_COMPAT_FLAG
+    }
+
+    if (BuiltinType)
+      checkArmStreamingBuiltin(*this, TheCall, SmType, *BuiltinType);
+  }
+
  llvm::APSInt Result;
  uint64_t mask = 0;
  unsigned TV = 0;
@@ -3266,6 +3411,9 @@
  if (CheckSVEBuiltinFunctionCall(BuiltinID, TheCall))
    return true;

+  if (CheckSMEBuiltinFunctionCall(BuiltinID, TheCall))
+    return true;
+
  // For intrinsics which take an immediate value as part of the instruction,
  // range check them here.
  unsigned i = 0, l = 0, u = 0;
diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@@ -3743,6 +3743,15 @@
    }
  }

+  // It is not allowed to redeclare an SME function with different SME
+  // attributes.
+  if (IsInvalidSMECallConversion(Old->getType(), New->getType())) {
+    Diag(New->getLocation(), diag::err_sme_attr_mismatch)
+        << New->getType() << Old->getType();
+    Diag(OldLocation, diag::note_previous_declaration);
+    return true;
+  }
+
  // If a function is first declared with a calling convention, but is later
  // declared or defined without one, all following decls assume the calling
  // convention of the first.
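// Illustration (not part of the patch itself): the redeclaration check
// added above rejects, for example:
//   void f(void) __attribute__((arm_streaming));
//   void f(void);  // error via err_sme_attr_mismatch: SME attributes
//                  // differ from the previous declaration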
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -275,6 +275,7 @@
  if (const auto *A = D->getAttr<AttrTy>()) {
    S.Diag(AL.getLoc(), diag::err_attributes_are_not_compatible) << AL << A;
    S.Diag(A->getLocation(), diag::note_conflicting_attribute);
+    AL.setInvalid();
    return true;
  }
  return false;
@@ -5585,6 +5586,14 @@
         BuiltinID <= AArch64::LastSVEBuiltin;
}

+static bool ArmSmeAliasValid(ASTContext &Context, unsigned BuiltinID,
+                             StringRef AliasName) {
+  if (Context.BuiltinInfo.isAuxBuiltinID(BuiltinID))
+    BuiltinID = Context.BuiltinInfo.getAuxBuiltinID(BuiltinID);
+  return BuiltinID >= AArch64::FirstSMEBuiltin &&
+         BuiltinID <= AArch64::LastSMEBuiltin;
+}
+
static void handleArmBuiltinAliasAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
  if (!AL.isArgIdent(0)) {
    S.Diag(AL.getLoc(), diag::err_attribute_argument_n_type)
@@ -5597,7 +5606,8 @@
  StringRef AliasName = cast<FunctionDecl>(D)->getIdentifier()->getName();

  bool IsAArch64 = S.Context.getTargetInfo().getTriple().isAArch64();
-  if ((IsAArch64 && !ArmSveAliasValid(S.Context, BuiltinID, AliasName)) ||
+  if ((IsAArch64 && !ArmSveAliasValid(S.Context, BuiltinID, AliasName) &&
+       !ArmSmeAliasValid(S.Context, BuiltinID, AliasName)) ||
      (!IsAArch64 && !ArmMveAliasValid(BuiltinID, AliasName) &&
       !ArmCdeAliasValid(BuiltinID, AliasName))) {
    S.Diag(AL.getLoc(), diag::err_attribute_arm_builtin_alias);
@@ -9203,6 +9213,10 @@
    handleArmBuiltinAliasAttr(S, D, AL);
    break;

+  case ParsedAttr::AT_ArmLocallyStreaming:
+    handleSimpleAttribute<ArmLocallyStreamingAttr>(S, D, AL);
+    break;
+
  case ParsedAttr::AT_AcquireHandle:
    handleAcquireHandleAttr(S, D, AL);
    break;
diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp
--- a/clang/lib/Sema/SemaDeclCXX.cpp
+++ b/clang/lib/Sema/SemaDeclCXX.cpp
@@ -17660,6 +17660,14 @@
    }
  }

+  // SME attributes must match when overriding a function declaration.
+  if (IsInvalidSMECallConversion(Old->getType(), New->getType())) {
+    Diag(New->getLocation(), diag::err_conflicting_overriding_attributes)
+        << New->getDeclName() << New->getType() << Old->getType();
+    Diag(Old->getLocation(), diag::note_overridden_virtual_function);
+    return true;
+  }
+
  // Virtual overrides must have the same code_seg.
  const auto *OldCSA = Old->getAttr<CodeSegAttr>();
  const auto *NewCSA = New->getAttr<CodeSegAttr>();
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -9216,6 +9216,20 @@
                                        ColonLoc, result, VK, OK);
}

+// Check that the SME attributes for PSTATE.ZA and PSTATE.SM are compatible.
+bool Sema::IsInvalidSMECallConversion(QualType FromType, QualType ToType) {
+  if (const auto *ToFn =
+          dyn_cast<FunctionProtoType>(Context.getCanonicalType(ToType)))
+    if (const auto *FromFn =
+            dyn_cast<FunctionProtoType>(Context.getCanonicalType(FromType)))
+      return (ToFn->getAArch64SMEAttributes() &
+              FunctionType::SME_AttributeMask) !=
+             (FromFn->getAArch64SMEAttributes() &
+              FunctionType::SME_AttributeMask);
+
+  return false;
+}
+
// Check if we have a conversion between incompatible cmse function pointer
// types, that is, a conversion between a function pointer with the
// cmse_nonsecure_call attribute and one without.
@@ -9374,6 +9388,8 @@
    return Sema::IncompatibleFunctionPointer;
  if (IsInvalidCmseNSCallConversion(S, ltrans, rtrans))
    return Sema::IncompatibleFunctionPointer;
+  if (S.IsInvalidSMECallConversion(ltrans, rtrans))
+    return Sema::IncompatibleFunctionPointer;
  return ConvTy;
}
diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp
--- a/clang/lib/Sema/SemaType.cpp
+++ b/clang/lib/Sema/SemaType.cpp
@@ -138,6 +138,11 @@
    case ParsedAttr::AT_NoReturn: \
    case ParsedAttr::AT_Regparm: \
    case ParsedAttr::AT_CmseNSCall: \
+    case ParsedAttr::AT_ArmStreaming: \
+    case ParsedAttr::AT_ArmStreamingCompatible: \
+    case ParsedAttr::AT_ArmNewZA: \
+    case ParsedAttr::AT_ArmSharedZA: \
+    case ParsedAttr::AT_ArmPreservesZA: \
    case ParsedAttr::AT_AnyX86NoCallerSavedRegisters: \
    case ParsedAttr::AT_AnyX86NoCfCheck: \
      CALLING_CONV_ATTRS_CASELIST
@@ -7640,6 +7645,24 @@
  llvm_unreachable("unexpected attribute kind!");
}

+static bool checkMutualExclusion(TypeProcessingState &state,
+                                 const FunctionProtoType::ExtProtoInfo &EPI,
+                                 ParsedAttr &Attr,
+                                 AttributeCommonInfo::Kind OtherKind) {
+  auto OtherAttr = std::find_if(
+      state.getCurrentAttributes().begin(), state.getCurrentAttributes().end(),
+      [OtherKind](const ParsedAttr &A) { return A.getKind() == OtherKind; });
+  if (OtherAttr == state.getCurrentAttributes().end() || OtherAttr->isInvalid())
+    return false;
+
+  Sema &S = state.getSema();
+  S.Diag(Attr.getLoc(), diag::err_attributes_are_not_compatible)
+      << OtherAttr->getAttrName() << Attr;
+  S.Diag(OtherAttr->getLoc(), diag::note_conflicting_attribute);
+  Attr.setInvalid();
+  return true;
+}
+
/// Process an individual function attribute. Returns true to
/// indicate that the attribute was handled, false if it wasn't.
static bool handleFunctionTypeAttr(TypeProcessingState &state, ParsedAttr &attr,
@@ -7769,6 +7792,65 @@
    return true;
  }

+  if (attr.getKind() == ParsedAttr::AT_ArmStreaming ||
+      attr.getKind() == ParsedAttr::AT_ArmStreamingCompatible ||
+      attr.getKind() == ParsedAttr::AT_ArmNewZA ||
+      attr.getKind() == ParsedAttr::AT_ArmSharedZA ||
+      attr.getKind() == ParsedAttr::AT_ArmPreservesZA) {
+    if (S.CheckAttrTarget(attr) || S.CheckAttrNoArgs(attr))
+      return true;
+
+    if (!unwrapped.isFunctionType())
+      return false;
+
+    const FunctionProtoType *FnTy = unwrapped.get()->getAs<FunctionProtoType>();
+    if (!FnTy) {
+      // SME ACLE attributes are not supported on K&R-style unprototyped C
+      // functions.
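+      // (For example, in C a K&R declaration such as 'int f();' declares no
+      //  prototype, so there is no FunctionProtoType to carry the attribute.)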
diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp
--- a/clang/lib/Sema/SemaType.cpp
+++ b/clang/lib/Sema/SemaType.cpp
@@ -138,6 +138,11 @@
   case ParsedAttr::AT_NoReturn:                                              \
   case ParsedAttr::AT_Regparm:                                               \
   case ParsedAttr::AT_CmseNSCall:                                            \
+  case ParsedAttr::AT_ArmStreaming:                                          \
+  case ParsedAttr::AT_ArmStreamingCompatible:                                \
+  case ParsedAttr::AT_ArmNewZA:                                              \
+  case ParsedAttr::AT_ArmSharedZA:                                           \
+  case ParsedAttr::AT_ArmPreservesZA:                                        \
   case ParsedAttr::AT_AnyX86NoCallerSavedRegisters:                          \
   case ParsedAttr::AT_AnyX86NoCfCheck:                                       \
     CALLING_CONV_ATTRS_CASELIST
@@ -7640,6 +7645,24 @@
   llvm_unreachable("unexpected attribute kind!");
 }

+static bool checkMutualExclusion(TypeProcessingState &state,
+                                 const FunctionProtoType::ExtProtoInfo &EPI,
+                                 ParsedAttr &Attr,
+                                 AttributeCommonInfo::Kind OtherKind) {
+  auto OtherAttr = std::find_if(
+      state.getCurrentAttributes().begin(), state.getCurrentAttributes().end(),
+      [OtherKind](const ParsedAttr &A) { return A.getKind() == OtherKind; });
+  if (OtherAttr == state.getCurrentAttributes().end() || OtherAttr->isInvalid())
+    return false;
+
+  Sema &S = state.getSema();
+  S.Diag(Attr.getLoc(), diag::err_attributes_are_not_compatible)
+      << OtherAttr->getAttrName() << Attr;
+  S.Diag(OtherAttr->getLoc(), diag::note_conflicting_attribute);
+  Attr.setInvalid();
+  return true;
+}
+
 /// Process an individual function attribute.  Returns true to
 /// indicate that the attribute was handled, false if it wasn't.
 static bool handleFunctionTypeAttr(TypeProcessingState &state, ParsedAttr &attr,
@@ -7769,6 +7792,65 @@
     return true;
   }

+  if (attr.getKind() == ParsedAttr::AT_ArmStreaming ||
+      attr.getKind() == ParsedAttr::AT_ArmStreamingCompatible ||
+      attr.getKind() == ParsedAttr::AT_ArmNewZA ||
+      attr.getKind() == ParsedAttr::AT_ArmSharedZA ||
+      attr.getKind() == ParsedAttr::AT_ArmPreservesZA) {
+    if (S.CheckAttrTarget(attr) || S.CheckAttrNoArgs(attr))
+      return true;
+
+    if (!unwrapped.isFunctionType())
+      return false;
+
+    const FunctionProtoType *FnTy =
+        unwrapped.get()->getAs<FunctionProtoType>();
+    if (!FnTy) {
+      // SME ACLE attributes are not supported on K&R-style unprototyped C
+      // functions.
+      S.Diag(attr.getLoc(), diag::warn_attribute_ignored) << attr;
+      attr.setInvalid();
+      return false;
+    }
+
+    FunctionProtoType::ExtProtoInfo EPI = FnTy->getExtProtoInfo();
+    switch (attr.getKind()) {
+    case ParsedAttr::AT_ArmStreaming:
+      if (checkMutualExclusion(state, EPI, attr,
+                               ParsedAttr::AT_ArmStreamingCompatible))
+        return true;
+      EPI.setArmSMEAttribute(FunctionType::SME_PStateSMEnabledMask);
+      break;
+    case ParsedAttr::AT_ArmStreamingCompatible:
+      if (checkMutualExclusion(state, EPI, attr, ParsedAttr::AT_ArmStreaming))
+        return true;
+      EPI.setArmSMEAttribute(FunctionType::SME_PStateSMCompatibleMask);
+      break;
+    case ParsedAttr::AT_ArmNewZA:
+      if (checkMutualExclusion(state, EPI, attr, ParsedAttr::AT_ArmSharedZA) ||
+          checkMutualExclusion(state, EPI, attr,
+                               ParsedAttr::AT_ArmPreservesZA))
+        return true;
+      EPI.setArmSMEAttribute(FunctionType::SME_PStateZANewMask);
+      break;
+    case ParsedAttr::AT_ArmSharedZA:
+      if (checkMutualExclusion(state, EPI, attr, ParsedAttr::AT_ArmNewZA))
+        return true;
+      EPI.setArmSMEAttribute(FunctionType::SME_PStateZASharedMask);
+      break;
+    case ParsedAttr::AT_ArmPreservesZA:
+      if (checkMutualExclusion(state, EPI, attr, ParsedAttr::AT_ArmNewZA))
+        return true;
+      EPI.setArmSMEAttribute(FunctionType::SME_PStateZAPreservedMask);
+      break;
+    default:
+      llvm_unreachable("Unsupported attribute");
+    }
+
+    QualType newtype = S.Context.getFunctionType(FnTy->getReturnType(),
+                                                 FnTy->getParamTypes(), EPI);
+    type = unwrapped.wrap(S, newtype->getAs<FunctionType>());
+    return true;
+  }
+
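Hypothetical declarations exercising each checkMutualExclusion() path in the
switch above (illustrative only; each error is emitted as
err_attributes_are_not_compatible with a note_conflicting_attribute on the
other attribute):

  void f1(void) __attribute__((arm_streaming, arm_streaming_compatible)); // error
  void f2(void) __attribute__((arm_new_za, arm_shared_za));               // error
  void f3(void) __attribute__((arm_new_za, arm_preserves_za));            // error
  void f4(void) __attribute__((arm_shared_za, arm_preserves_za));         // accepted;
  // only arm_new_za is mutually exclusive with the other two ZA attributes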
   if (attr.getKind() == ParsedAttr::AT_NoThrow) {
     // Delay if this is not a function type.
     if (!unwrapped.isFunctionType())
diff --git a/clang/test/AST/ast-dump-aarch64-sve-types.c b/clang/test/AST/ast-dump-aarch64-sve-types.c
--- a/clang/test/AST/ast-dump-aarch64-sve-types.c
+++ b/clang/test/AST/ast-dump-aarch64-sve-types.c
@@ -47,3 +47,6 @@
 // CHECK: TypedefDecl {{.*}} implicit __SVBool_t '__SVBool_t'
 // CHECK-NEXT: `-BuiltinType {{.*}} '__SVBool_t'
+
+// CHECK: TypedefDecl {{.*}} implicit __SVCount_t '__SVCount_t'
+// CHECK-NEXT: `-BuiltinType {{.*}} '__SVCount_t'
diff --git a/clang/test/CodeGen/aarch64-debug-sve-vector-types.c b/clang/test/CodeGen/aarch64-debug-sve-vector-types.c
--- a/clang/test/CodeGen/aarch64-debug-sve-vector-types.c
+++ b/clang/test/CodeGen/aarch64-debug-sve-vector-types.c
@@ -9,6 +9,9 @@
 // CHECK-DAG: ![[REALELTS1_64]] = !DISubrange(lowerBound: 0, upperBound: !DIExpression(DW_OP_constu, 1, DW_OP_bregx, 46, 0, DW_OP_mul, DW_OP_constu, 1, DW_OP_minus))
   __SVBool_t b8;

+  // CHECK-DAG: name: "__SVCount_t",{{.*}}, baseType: ![[CT1]]
+  __SVCount_t c8;
+
   // CHECK-DAG: name: "__SVInt8_t",{{.*}}, baseType: ![[CT8:[0-9]+]]
   // CHECK-DAG: ![[CT8]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTYS8:[0-9]+]], flags: DIFlagVector, elements: ![[ELTS8:[0-9]+]])
   // CHECK-DAG: ![[ELTTYS8]] = !DIBasicType(name: "signed char", size: 8, encoding: DW_ATE_signed_char)
diff --git a/clang/test/CodeGen/aarch64-debug-sve-vectorx2-types.c b/clang/test/CodeGen/aarch64-debug-sve-vectorx2-types.c
--- a/clang/test/CodeGen/aarch64-debug-sve-vectorx2-types.c
+++ b/clang/test/CodeGen/aarch64-debug-sve-vectorx2-types.c
@@ -64,4 +64,9 @@
 // CHECK-DAG: ![[CT64]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTY64:[0-9]+]], flags: DIFlagVector, elements: ![[ELTS1x2_64]])
 // CHECK-DAG: ![[ELTTY64]] = !DIBasicType(name: "double", size: 64, encoding: DW_ATE_float)
   __clang_svfloat64x2_t f64;
+
+  // CHECK: name: "__clang_svboolx2_t", {{.*}}, baseType: ![[CT80:[0-9]+]])
+  // CHECK-DAG: ![[CT80]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTY8]], flags: DIFlagVector, elements: ![[ELTS1x2_64]])
+  __clang_svboolx2_t i1_t2;
+
 }
diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/aarch64-sme-attrs.c b/clang/test/CodeGen/aarch64-sme-intrinsics/aarch64-sme-attrs.c
new file mode 100644
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/aarch64-sme-attrs.c
@@ -0,0 +1,195 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme \
+// RUN:   -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s
+
+extern int normal_callee();
+
+// == FUNCTION DECLARATIONS ==
+
+__attribute__((arm_streaming)) int streaming_decl(void);
+__attribute__((arm_streaming_compatible)) int streaming_compatible_decl(void);
+__attribute__((arm_locally_streaming)) int locally_streaming_decl(void);
+__attribute__((arm_shared_za)) int shared_za_decl(void);
+__attribute__((arm_preserves_za)) int preserves_za_decl(void);
+__attribute__((arm_new_za)) int new_za_decl(void);
+
+// == FUNCTION DEFINITIONS ==
+
+// CHECK-LABEL: @streaming_caller({{.*}}#0
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[CALL:%.*]] = tail call i32 @normal_callee() #[[ATTR14:[0-9]+]]
+// CHECK-NEXT: ret i32 [[CALL]]
+//
+__attribute__((arm_streaming)) int streaming_caller(void) {
+  return normal_callee();
+}
+
+// CHECK-LABEL: @streaming_callee({{.*}}#0
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[CALL:%.*]] = tail call i32 @streaming_decl() #[[ATTR15:[0-9]+]]
+// CHECK-NEXT: ret i32 [[CALL]]
+//
+__attribute__((arm_streaming)) int streaming_callee(void) {
+  return streaming_decl();
+}
+
+// CHECK: declare i32 @streaming_decl(){{.*}}#2
+
+// CHECK-LABEL: @streaming_compatible_caller({{.*}}#3
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[CALL:%.*]] = tail call i32 @normal_callee() #[[ATTR14]]
+// CHECK-NEXT: ret i32 [[CALL]]
+//
+__attribute__((arm_streaming_compatible)) int streaming_compatible_caller(void) {
+  return normal_callee();
+}
+
+// CHECK-LABEL: @streaming_compatible_callee({{.*}}#3
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[CALL:%.*]] = tail call i32 @streaming_compatible_decl() #[[ATTR16:[0-9]+]]
+// CHECK-NEXT: ret i32 [[CALL]]
+//
+__attribute__((arm_streaming_compatible)) int streaming_compatible_callee(void) {
+  return streaming_compatible_decl();
+}
+
+// CHECK: declare i32 @streaming_compatible_decl(){{.*}}#4
+
+// CHECK-LABEL: @locally_streaming_caller({{.*}}#5
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[CALL:%.*]] = tail call i32 @normal_callee() #[[ATTR14]]
+// CHECK-NEXT: ret i32 [[CALL]]
+//
+__attribute__((arm_locally_streaming)) int locally_streaming_caller(void) {
+  return normal_callee();
+}
+
+// CHECK-LABEL: @locally_streaming_callee({{.*}}#5
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[CALL:%.*]] = tail call i32 @locally_streaming_decl() #[[ATTR17:[0-9]+]]
+// CHECK-NEXT: ret i32 [[CALL]]
+//
+__attribute__((arm_locally_streaming)) int locally_streaming_callee(void) {
+  return locally_streaming_decl();
+}
+
+// CHECK: declare i32 @locally_streaming_decl(){{.*}}#6
+
+// CHECK-LABEL: @shared_za_caller({{.*}}#7
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[CALL:%.*]] = tail call i32 @normal_callee() #[[ATTR14]]
+// CHECK-NEXT: ret i32 [[CALL]]
+//
+__attribute__((arm_shared_za)) int shared_za_caller(void) {
+  return normal_callee();
+}
+
+// CHECK-LABEL: @shared_za_callee({{.*}}#7
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[CALL:%.*]] = tail call i32 @shared_za_decl() #[[ATTR18:[0-9]+]]
+// CHECK-NEXT: ret i32 [[CALL]]
+//
+__attribute__((arm_shared_za)) int shared_za_callee(void) { + return shared_za_decl(); +} + +// CHECK: declare i32 @shared_za_decl(){{.*}}#8 + +// CHECK-LABEL: @preserves_za_caller({{.*}}#9 +// CHECK-NEXT: entry: +// CHECK-NEXT: [[CALL:%.*]] = tail call i32 @normal_callee() #[[ATTR14]] +// CHECK-NEXT: ret i32 [[CALL]] +// +__attribute__((arm_preserves_za)) int preserves_za_caller(void) { + return normal_callee(); +} + +// CHECK-LABEL: @preserves_za_callee({{.*}}#9 +// CHECK-NEXT: entry: +// CHECK-NEXT: [[CALL:%.*]] = tail call i32 @preserves_za_decl() #[[ATTR19:[0-9]+]] +// CHECK-NEXT: ret i32 [[CALL]] +// +__attribute__((arm_preserves_za)) int preserves_za_callee(void) { + return preserves_za_decl(); +} + +// CHECK: declare i32 @preserves_za_decl(){{.*}}#10 + +// CHECK-LABEL: @new_za_caller({{.*}}#11 +// CHECK-NEXT: entry: +// CHECK-NEXT: [[CALL:%.*]] = tail call i32 @normal_callee() #[[ATTR14]] +// CHECK-NEXT: ret i32 [[CALL]] +// +__attribute__((arm_new_za)) int new_za_caller(void) { + return normal_callee(); +} + +// CHECK-LABEL: @new_za_callee({{.*}}#11 +// CHECK-NEXT: entry: +// CHECK-NEXT: [[CALL:%.*]] = tail call i32 @new_za_decl() #[[ATTR20:[0-9]+]] +// CHECK-NEXT: ret i32 [[CALL]] +// +__attribute__((arm_new_za)) int new_za_callee(void) { + return new_za_decl(); +} + +// CHECK: declare i32 @new_za_decl(){{.*}}#12 + +// Ensure that the attributes are correctly propagated to function types +// and also to callsites. +typedef void __attribute__((arm_streaming)) (*s_ptrty) (int, int); +typedef void __attribute__((arm_streaming_compatible)) (*sc_ptrty) (int, int); +typedef void __attribute__((arm_new_za)) (*nz_ptrty) (int, int); +typedef void __attribute__((arm_shared_za)) (*sz_ptrty) (int, int); +typedef void __attribute__((arm_preserves_za)) (*pz_ptrty) (int, int); + +// CHECK-LABEL: @test_streaming_ptrty( +// CHECK-NEXT: entry: +// CHECK-NEXT: call void [[F:%.*]](i32 noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR15]] +// CHECK-NEXT: ret void +// +void test_streaming_ptrty(s_ptrty f, int x, int y) { return f(x, y); } +// CHECK-LABEL: @test_streaming_compatible_ptrty( +// CHECK-NEXT: entry: +// CHECK-NEXT: call void [[F:%.*]](i32 noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR16]] +// CHECK-NEXT: ret void +// +void test_streaming_compatible_ptrty(sc_ptrty f, int x, int y) { return f(x, y); } +// CHECK-LABEL: @test_new_za( +// CHECK-NEXT: entry: +// CHECK-NEXT: call void [[F:%.*]](i32 noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR20]] +// CHECK-NEXT: ret void +// +void __attribute__((arm_shared_za)) test_new_za(nz_ptrty f, int x, int y) { return f(x, y); } +// CHECK-LABEL: @test_shared_za( +// CHECK-NEXT: entry: +// CHECK-NEXT: call void [[F:%.*]](i32 noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR18]] +// CHECK-NEXT: ret void +// +void __attribute__((arm_shared_za)) test_shared_za(sz_ptrty f, int x, int y) { return f(x, y); } +// CHECK-LABEL: @test_preserved_za( +// CHECK-NEXT: entry: +// CHECK-NEXT: call void [[F:%.*]](i32 noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR19]] +// CHECK-NEXT: ret void +// +void __attribute__((arm_shared_za)) test_preserved_za(pz_ptrty f, int x, int y) { return f(x, y); } + + +// CHECK: attributes #0 = { {{.*}}"aarch64_pstate_sm_enabled"{{.*}} } +// CHECK: attributes #2 = { {{.*}}"aarch64_pstate_sm_enabled"{{.*}} } +// CHECK: attributes #3 = { {{.*}}"aarch64_pstate_sm_compatible"{{.*}} } +// CHECK: attributes #4 = { {{.*}}"aarch64_pstate_sm_compatible"{{.*}} } +// CHECK: attributes #5 = { {{.*}}"aarch64_pstate_sm_body"{{.*}} } +// CHECK: 
attributes #6 = { {{.*}}"aarch64_pstate_sm_body"{{.*}} } +// CHECK: attributes #7 = { {{.*}}"aarch64_pstate_za_shared"{{.*}} } +// CHECK: attributes #8 = { {{.*}}"aarch64_pstate_za_shared"{{.*}} } +// CHECK: attributes #9 = { {{.*}}"aarch64_pstate_za_preserved"{{.*}} } +// CHECK: attributes #10 = { {{.*}}"aarch64_pstate_za_preserved"{{.*}} } +// CHECK: attributes #11 = { {{.*}}"aarch64_pstate_za_new"{{.*}} } +// CHECK: attributes #12 = { {{.*}}"aarch64_pstate_za_new"{{.*}} } +// CHECK: attributes #15 = { {{.*}}"aarch64_pstate_sm_enabled"{{.*}} } +// CHECK: attributes #16 = { {{.*}}"aarch64_pstate_sm_compatible"{{.*}} } +// CHECK: attributes #17 = { {{.*}}"aarch64_pstate_sm_body"{{.*}} } +// CHECK: attributes #18 = { {{.*}}"aarch64_pstate_za_shared"{{.*}} } +// CHECK: attributes #19 = { {{.*}}"aarch64_pstate_za_preserved"{{.*}} } +// CHECK: attributes #20 = { {{.*}}"aarch64_pstate_za_new"{{.*}} } diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/aarch64-sme-attrs.cpp b/clang/test/CodeGen/aarch64-sme-intrinsics/aarch64-sme-attrs.cpp new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/aarch64-sme-attrs.cpp @@ -0,0 +1,280 @@ +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme \ +// RUN: -S -O0 -Werror -emit-llvm -o - %s | FileCheck %s + +extern "C" { + +extern int normal_callee(); + +// == FUNCTION DECLARATIONS == + +__attribute__((arm_streaming)) int streaming_decl(); +__attribute__((arm_streaming_compatible)) int streaming_compatible_decl(); +__attribute__((arm_shared_za)) int shared_za_decl(); +__attribute__((arm_preserves_za)) int preserves_za_decl(); +__attribute__((arm_new_za)) int new_za_decl(); + +// == FUNCTION DEFINITIONS == + +// CHECK-LABEL: @streaming_caller() +// CHECK-SAME: #[[SM_ENABLED:[0-9]+]] +// CHECK: call i32 @normal_callee() +// +__attribute__((arm_streaming)) int streaming_caller() { + return normal_callee(); +} + +// CHECK: declare i32 @normal_callee() #[[NORMAL_DECL:[0-9]+]] + + +// CHECK-LABEL: @streaming_callee() +// CHECK-SAME: #[[SM_ENABLED]] +// CHECK: call i32 @streaming_decl() #[[SM_ENABLED_CALL:[0-9]+]] +// +__attribute__((arm_streaming)) int streaming_callee() { + return streaming_decl(); +} + +// CHECK: declare i32 @streaming_decl() #[[SM_ENABLED_DECL:[0-9]+]] + +// CHECK-LABEL: @streaming_compatible_caller() +// CHECK-SAME: #[[SM_COMPATIBLE:[0-9]+]] +// CHECK: call i32 @normal_callee() +// +__attribute__((arm_streaming_compatible)) int streaming_compatible_caller() { + return normal_callee(); +} + +// CHECK-LABEL: @streaming_compatible_callee() +// CHECK-SAME: #[[SM_COMPATIBLE]] +// CHECK: call i32 @streaming_compatible_decl() #[[SM_COMPATIBLE_CALL:[0-9]+]] +// +__attribute__((arm_streaming_compatible)) int streaming_compatible_callee() { + return streaming_compatible_decl(); +} + +// CHECK: declare i32 @streaming_compatible_decl() #[[SM_COMPATIBLE_DECL:[0-9]+]] + +// CHECK-LABEL: @locally_streaming_caller() +// CHECK-SAME: #[[SM_BODY:[0-9]+]] +// CHECK: call i32 @normal_callee() +// +__attribute__((arm_locally_streaming)) int locally_streaming_caller() { + return normal_callee(); +} + +// CHECK-LABEL: @locally_streaming_callee() +// CHECK-SAME: #[[SM_BODY]] +// CHECK: call i32 @locally_streaming_caller() #[[SM_BODY_CALL:[0-9]+]] +// +__attribute__((arm_locally_streaming)) int locally_streaming_callee() { + return locally_streaming_caller(); +} + + +// CHECK-LABEL: @shared_za_caller() +// CHECK-SAME: #[[ZA_SHARED:[0-9]+]] +// CHECK: call i32 @normal_callee() +// +__attribute__((arm_shared_za)) int 
shared_za_caller() { + return normal_callee(); +} + +// CHECK-LABEL: @shared_za_callee() +// CHECK-SAME: #[[ZA_SHARED]] +// CHECK: call i32 @shared_za_decl() #[[ZA_SHARED_CALL:[0-9]+]] +// +__attribute__((arm_shared_za)) int shared_za_callee() { + return shared_za_decl(); +} + +// CHECK: declare i32 @shared_za_decl() #[[ZA_SHARED_DECL:[0-9]+]] + + +// CHECK-LABEL: @preserves_za_caller() +// CHECK-SAME: #[[ZA_PRESERVED:[0-9]+]] +// CHECK: call i32 @normal_callee() +// +__attribute__((arm_preserves_za)) int preserves_za_caller() { + return normal_callee(); +} + +// CHECK-LABEL: @preserves_za_callee() +// CHECK-SAME: #[[ZA_PRESERVED]] +// CHECK: call i32 @preserves_za_decl() #[[ZA_PRESERVED_CALL:[0-9]+]] +// +__attribute__((arm_preserves_za)) int preserves_za_callee() { + return preserves_za_decl(); +} + +// CHECK: declare i32 @preserves_za_decl() #[[ZA_PRESERVED_DECL:[0-9]+]] + + +// CHECK-LABEL: @new_za_caller() +// CHECK-SAME: #[[ZA_NEW:[0-9]+]] +// CHECK: call i32 @normal_callee() +// +__attribute__((arm_new_za)) int new_za_caller() { + return normal_callee(); +} + +// CHECK-LABEL: @new_za_callee() +// CHECK-SAME: #[[ZA_NEW]] +// CHECK: call i32 @new_za_decl() #[[ZA_NEW_CALL:[0-9]+]] +// +__attribute__((arm_new_za)) int new_za_callee() { + return new_za_decl(); +} + +// CHECK: declare i32 @new_za_decl() #[[ZA_NEW_DECL:[0-9]+]] + + +// Ensure that the attributes are correctly propagated to function types +// and also to callsites. +typedef void __attribute__((arm_streaming)) (*s_ptrty) (int, int); +typedef void __attribute__((arm_streaming_compatible)) (*sc_ptrty) (int, int); +typedef void __attribute__((arm_new_za)) (*nz_ptrty) (int, int); +typedef void __attribute__((arm_shared_za)) (*sz_ptrty) (int, int); +typedef void __attribute__((arm_preserves_za)) (*pz_ptrty) (int, int); + +// CHECK-LABEL: @test_streaming_ptrty( +// CHECK-SAME: #[[NORMAL_DEF:[0-9]+]] +// CHECK: call void [[F:%.*]](i32 noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[SM_ENABLED_CALL]] +// +void test_streaming_ptrty(s_ptrty f, int x, int y) { return f(x, y); } +// CHECK-LABEL: @test_streaming_compatible_ptrty( +// CHECK-SAME: #[[NORMAL_DEF]] +// CHECK: call void [[F:%.*]](i32 noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[SM_COMPATIBLE_CALL]] +// +void test_streaming_compatible_ptrty(sc_ptrty f, int x, int y) { return f(x, y); } +// CHECK-LABEL: @test_new_za( +// CHECK-SAME: #[[ZA_SHARED]] +// CHECK: call void [[F:%.*]](i32 noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ZA_NEW_CALL]] +// +void __attribute__((arm_shared_za)) test_new_za(nz_ptrty f, int x, int y) { return f(x, y); } +// CHECK-LABEL: @test_shared_za( +// CHECK-SAME: #[[ZA_SHARED]] +// CHECK: call void [[F:%.*]](i32 noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ZA_SHARED_CALL]] +// +void __attribute__((arm_shared_za)) test_shared_za(sz_ptrty f, int x, int y) { return f(x, y); } +// CHECK-LABEL: @test_preserved_za( +// CHECK-SAME: #[[ZA_SHARED]] +// CHECK: call void [[F:%.*]](i32 noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ZA_PRESERVED_CALL]] +// +void __attribute__((arm_shared_za)) test_preserved_za(pz_ptrty f, int x, int y) { return f(x, y); } + +// CHECK-LABEL: @test_indirect_streaming_ptrty( +// CHECK-SAME: #[[NORMAL_DEF:[0-9]+]] +// CHECK: call void [[F:%.*]](i32 noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[SM_ENABLED_CALL]] +// +typedef s_ptrty **indirect_s_ptrty; +void test_indirect_streaming_ptrty(indirect_s_ptrty fptr, int x, int y) { return (**fptr)(x, y); } +} // extern "C" + +// +// Test that having the attribute in different places (on declaration 
and on type) +// both results in the attribute being applied to the type. +// + +// CHECK-LABEL: @_Z24test_same_type_streamingv( +// CHECK: call void @_Z10streaming1v() #[[SM_ENABLED_CALL]] +// CHECK: call void @_Z10streaming2v() #[[SM_ENABLED_CALL]] +// CHECK: call void @_Z10streaming3v() #[[SM_ENABLED_CALL]] +// CHECK: call void @_Z20same_type_streaming1v() #[[SM_ENABLED_CALL]] +// CHECK: call void @_Z20same_type_streaming2v() #[[SM_ENABLED_CALL]] +// CHECK: call void @_Z20same_type_streaming3v() #[[SM_ENABLED_CALL]] +// CHECK: ret void +// CHECK: } +// CHECK: declare void @_Z10streaming1v() #[[SM_ENABLED_DECL]] +// CHECK: declare void @_Z10streaming2v() #[[SM_ENABLED_DECL]] +// CHECK: declare void @_Z10streaming3v() #[[SM_ENABLED_DECL]] +// CHECK: declare void @_Z20same_type_streaming1v() #[[SM_ENABLED_DECL]] +// CHECK: declare void @_Z20same_type_streaming2v() #[[SM_ENABLED_DECL]] +// CHECK: declare void @_Z20same_type_streaming3v() #[[SM_ENABLED_DECL]] +__attribute__((arm_streaming)) void streaming1(); +void __attribute__((arm_streaming)) streaming2(); +void streaming3() __attribute__((arm_streaming)); +decltype(streaming1) same_type_streaming1; +decltype(streaming2) same_type_streaming2; +decltype(streaming3) same_type_streaming3; +void test_same_type_streaming() { + streaming1(); + streaming2(); + streaming3(); + same_type_streaming1(); + same_type_streaming2(); + same_type_streaming3(); +} + +// +// Test overloading; the attribute is not required for overloaded types and +// does not apply if not specified. +// + +// CHECK-LABEL: @_Z12overloadedfni( +// CHECK-SAME: #[[SM_ENABLED]] +int __attribute__((arm_streaming)) overloadedfn(int x) { return x; } +// CHECK-LABEL: @_Z12overloadedfnf( +// CHECK-SAME: #[[NORMAL_DEF]] +// +float overloadedfn(float x) { return x; } +// CHECK-LABEL: @_Z13test_overloadi( +// CHECK-SAME: #[[NORMAL_DEF]] +// +int test_overload(int x) { return overloadedfn(x); } +// CHECK-LABEL: @_Z13test_overloadf( +// CHECK-SAME: #[[NORMAL_DEF]] +// +float test_overload(float x) { return overloadedfn(x); } + +// CHECK-LABEL: @_Z11test_lambdai( +// CHECK-SAME: #[[NORMAL_DEF]] +// CHECK: call noundef i32 @"_ZZ11test_lambdaiENK3$_0clEi"({{.*}}) #[[SM_ENABLED_CALL]] +// +// CHECK: @"_ZZ11test_lambdaiENK3$_0clEi"( +// CHECK-SAME: #[[SM_ENABLED]] +int test_lambda(int x) { + auto F = [](int x) __attribute__((arm_streaming)) { return x; }; + return F(x); +} + +// CHECK-LABEL: @_Z27test_template_instantiationv( +// CHECK-SAME: #[[NORMAL_DEF]] +// CHECK: call noundef i32 @_Z15template_functyIiET_S0_(i32 noundef 12) #[[SM_ENABLED_CALL]] +// +// CHECK: @_Z15template_functyIiET_S0_( +// CHECK-SAME: #[[SM_ENABLED]] +template +Ty template_functy(Ty x) __attribute__((arm_streaming)) { return x; } +int test_template_instantiation() { return template_functy(12); } + +// +// Test that arm_locally_streaming is inherited by future redeclarations, +// even when they don't specify the attribute. 
+// + +// CHECK: define {{.*}} @_Z25locally_streaming_inheritv( +// CHECK-SAME: #[[SM_BODY]] +__attribute__((arm_locally_streaming)) void locally_streaming_inherit(); +void locally_streaming_inherit() { + streaming_decl(); +} + +// CHECK: attributes #[[SM_ENABLED]] = { mustprogress noinline nounwind optnone vscale_range(1,16) "aarch64_pstate_sm_enabled" "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+sme" } +// CHECK: attributes #[[NORMAL_DECL]] = { "frame-pointer"="none" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+sme" } +// CHECK: attributes #[[SM_ENABLED_DECL]] = { "aarch64_pstate_sm_enabled" "frame-pointer"="none" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+sme" } +// CHECK: attributes #[[SM_COMPATIBLE]] = { mustprogress noinline nounwind optnone vscale_range(1,16) "aarch64_pstate_sm_compatible" "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+sme" } +// CHECK: attributes #[[SM_COMPATIBLE_DECL]] = { "aarch64_pstate_sm_compatible" "frame-pointer"="none" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+sme" } +// CHECK: attributes #[[SM_BODY]] = { mustprogress noinline nounwind optnone vscale_range(1,16) "aarch64_pstate_sm_body" "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+sme" } +// CHECK: attributes #[[ZA_SHARED]] = { mustprogress noinline nounwind optnone vscale_range(1,16) "aarch64_pstate_za_shared" "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+sme" } +// CHECK: attributes #[[ZA_SHARED_DECL]] = { "aarch64_pstate_za_shared" "frame-pointer"="none" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+sme" } +// CHECK: attributes #[[ZA_PRESERVED]] = { mustprogress noinline nounwind optnone vscale_range(1,16) "aarch64_pstate_za_preserved" "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+sme" } +// CHECK: attributes #[[ZA_PRESERVED_DECL]] = { "aarch64_pstate_za_preserved" "frame-pointer"="none" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+sme" } +// CHECK: attributes #[[ZA_NEW]] = { mustprogress noinline nounwind optnone vscale_range(1,16) "aarch64_pstate_za_new" "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+sme" } +// CHECK: attributes #[[ZA_NEW_DECL]] = { "aarch64_pstate_za_new" "frame-pointer"="none" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+sme" } +// CHECK: attributes #[[NORMAL_DEF]] = { mustprogress noinline nounwind optnone vscale_range(1,16) "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+sme" } +// CHECK: attributes #[[SM_ENABLED_CALL]] = { "aarch64_pstate_sm_enabled" } +// CHECK: attributes #[[SM_COMPATIBLE_CALL]] = { "aarch64_pstate_sm_compatible" } +// CHECK: attributes #[[SM_BODY_CALL]] = { "aarch64_pstate_sm_body" } +// CHECK: attributes #[[ZA_SHARED_CALL]] = { "aarch64_pstate_za_shared" } +// CHECK: attributes #[[ZA_PRESERVED_CALL]] = { "aarch64_pstate_za_preserved" } +// CHECK: attributes 
#[[ZA_NEW_CALL]] = { "aarch64_pstate_za_new" } diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add.c @@ -0,0 +1,180 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -D__ARM_FEATURE_SME=1 -D__ARM_FEATURE_SME_I64I64=1 -triple aarch64-none-linux-gnu \ +// RUN: -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -D__ARM_FEATURE_SME=1 -D__ARM_FEATURE_SME_I64I64=1 -triple aarch64-none-linux-gnu \ +// RUN: -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK + +#include + +// == HORIZONTAL == + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svaddha_za32_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PN:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PM:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.addha.nxv4i32(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_svaddha_za32_s32u10__SVBool_tu10__SVBool_tu11__SVInt32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PN:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PM:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.addha.nxv4i32(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svaddha_za32_s32(svbool_t pn, svbool_t pm, svint32_t zn) { + svaddha_za32_s32(3, pn, pm, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svaddha_za32_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PN:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PM:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.addha.nxv4i32(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_svaddha_za32_u32u10__SVBool_tu10__SVBool_tu12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PN:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PM:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.addha.nxv4i32(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svaddha_za32_u32(svbool_t pn, svbool_t pm, svuint32_t zn) { + svaddha_za32_u32(3, pn, pm, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svaddha_za64_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PN:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PM:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.addha.nxv2i64(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_svaddha_za64_s64u10__SVBool_tu10__SVBool_tu11__SVInt64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PN:%.*]]) 
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PM:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.addha.nxv2i64(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svaddha_za64_s64(svbool_t pn, svbool_t pm, svint64_t zn) { + svaddha_za64_s64(3, pn, pm, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svaddha_za64_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PN:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PM:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.addha.nxv2i64(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_svaddha_za64_u64u10__SVBool_tu10__SVBool_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PN:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PM:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.addha.nxv2i64(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svaddha_za64_u64(svbool_t pn, svbool_t pm, svuint64_t zn) { + svaddha_za64_u64(3, pn, pm, zn); +} + +// == VERTICAL == + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svaddva_za32_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PN:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PM:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.addva.nxv4i32(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_svaddva_za32_s32u10__SVBool_tu10__SVBool_tu11__SVInt32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PN:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PM:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.addva.nxv4i32(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svaddva_za32_s32(svbool_t pn, svbool_t pm, svint32_t zn) { + svaddva_za32_s32(3, pn, pm, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svaddva_za32_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PN:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PM:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.addva.nxv4i32(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_svaddva_za32_u32u10__SVBool_tu10__SVBool_tu12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PN:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PM:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.addva.nxv4i32(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svaddva_za32_u32(svbool_t pn, svbool_t pm, svuint32_t zn) { + svaddva_za32_u32(3, pn, pm, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svaddva_za64_s64( +// CHECK-NEXT: entry: 
+// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PN:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PM:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.addva.nxv2i64(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_svaddva_za64_s64u10__SVBool_tu10__SVBool_tu11__SVInt64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PN:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PM:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.addva.nxv2i64(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svaddva_za64_s64(svbool_t pn, svbool_t pm, svint64_t zn) { + svaddva_za64_s64(3, pn, pm, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svaddva_za64_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PN:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PM:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.addva.nxv2i64(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_svaddva_za64_u64u10__SVBool_tu10__SVBool_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PN:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PM:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.addva.nxv2i64(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svaddva_za64_u64(svbool_t pn, svbool_t pm, svuint64_t zn) { + svaddva_za64_u64(3, pn, pm, zn); +} diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_cnt.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_cnt.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_cnt.c @@ -0,0 +1,66 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -D__ARM_FEATURE_SME=1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -D__ARM_FEATURE_SME=1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK + +#include + +// CHECK-LABEL: @test_svcntsb( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CHECK-NEXT: ret i64 [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z12test_svcntsbv( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CPP-CHECK-NEXT: ret i64 [[TMP0]] +// +__attribute__((arm_streaming_compatible)) +uint64_t test_svcntsb(void) { + return svcntsb(); +} + +// CHECK-LABEL: @test_svcntsh( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsh() +// CHECK-NEXT: ret i64 [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z12test_svcntshv( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsh() +// CPP-CHECK-NEXT: ret i64 [[TMP0]] +// +__attribute__((arm_streaming_compatible)) +uint64_t test_svcntsh(void) { + return svcntsh(); +} + +// CHECK-LABEL: @test_svcntsw( +// CHECK-NEXT: 
entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsw() +// CHECK-NEXT: ret i64 [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z12test_svcntswv( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsw() +// CPP-CHECK-NEXT: ret i64 [[TMP0]] +// +__attribute__((arm_streaming_compatible)) +uint64_t test_svcntsw(void) { + return svcntsw(); +} + +// CHECK-LABEL: @test_svcntsd( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsd() +// CHECK-NEXT: ret i64 [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z12test_svcntsdv( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsd() +// CPP-CHECK-NEXT: ret i64 [[TMP0]] +// +__attribute__((arm_streaming_compatible)) +uint64_t test_svcntsd(void) { + return svcntsd(); +} diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_loads.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_loads.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_loads.c @@ -0,0 +1,498 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -D__ARM_FEATURE_SME=1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -D__ARM_FEATURE_SME=1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK + +#include + +// == HORIZONTAL == + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svld1_hor_za8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1b.horiz( [[PG:%.*]], ptr [[BASE:%.*]], i32 0, i32 [[TMP0]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z18test_svld1_hor_za8ju10__SVBool_tPKa( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1b.horiz( [[PG:%.*]], ptr [[BASE:%.*]], i32 0, i32 [[TMP0]]) +// CPP-CHECK-NEXT: ret void +// +void test_svld1_hor_za8(uint32_t slice_base, svbool_t pg, const int8_t *base) { + svld1_hor_za8(0, slice_base, 15, pg, base); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svld1_hor_za16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1h.horiz( [[TMP1]], ptr [[BASE:%.*]], i32 1, i32 [[TMP0]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z19test_svld1_hor_za16ju10__SVBool_tPKs( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1h.horiz( [[TMP1]], ptr [[BASE:%.*]], i32 1, i32 [[TMP0]]) +// CPP-CHECK-NEXT: ret void +// +void test_svld1_hor_za16(uint32_t slice_base, svbool_t pg, const int16_t *base) { + svld1_hor_za16(1, slice_base, 7, pg, base); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svld1_hor_za32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) 
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1w.horiz( [[TMP1]], ptr [[BASE:%.*]], i32 3, i32 [[TMP0]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z19test_svld1_hor_za32ju10__SVBool_tPKi( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1w.horiz( [[TMP1]], ptr [[BASE:%.*]], i32 3, i32 [[TMP0]]) +// CPP-CHECK-NEXT: ret void +// +void test_svld1_hor_za32(uint32_t slice_base, svbool_t pg, const int32_t *base) { + svld1_hor_za32(3, slice_base, 3, pg, base); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svld1_hor_za64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1d.horiz( [[TMP1]], ptr [[BASE:%.*]], i32 7, i32 [[TMP0]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z19test_svld1_hor_za64ju10__SVBool_tPKl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1d.horiz( [[TMP1]], ptr [[BASE:%.*]], i32 7, i32 [[TMP0]]) +// CPP-CHECK-NEXT: ret void +// +void test_svld1_hor_za64(uint32_t slice_base, svbool_t pg, const int64_t *base) { + svld1_hor_za64(7, slice_base, 1, pg, base); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svld1_hor_za128( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1q.horiz( [[TMP0]], ptr [[BASE:%.*]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svld1_hor_za128ju10__SVBool_tPKn( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1q.horiz( [[TMP0]], ptr [[BASE:%.*]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svld1_hor_za128(uint32_t slice_base, svbool_t pg, const __int128_t *base) { + svld1_hor_za128(15, slice_base, 0, pg, base); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svld1_hor_vnum_za8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 3 +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP1]] +// CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1b.horiz( [[PG:%.*]], ptr [[TMP2]], i32 0, i32 [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_svld1_hor_vnum_za8ju10__SVBool_tPKa( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CPP-CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP1]] +// CPP-CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1b.horiz( [[PG:%.*]], ptr [[TMP2]], i32 0, i32 [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void 
test_svld1_hor_vnum_za8(uint32_t slice_base, svbool_t pg, const int8_t *base) { + svld1_hor_vnum_za8(0, slice_base, 15, pg, base, 3); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svld1_hor_vnum_za16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 3 +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP1]] +// CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1h.horiz( [[TMP4]], ptr [[TMP2]], i32 1, i32 [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svld1_hor_vnum_za16ju10__SVBool_tPKs( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CPP-CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP1]] +// CPP-CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1h.horiz( [[TMP4]], ptr [[TMP2]], i32 1, i32 [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svld1_hor_vnum_za16(uint32_t slice_base, svbool_t pg, const int16_t *base) { + svld1_hor_vnum_za16(1, slice_base, 7, pg, base, 3); +} + +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svld1_hor_vnum_za32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 3 +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP1]] +// CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1w.horiz( [[TMP4]], ptr [[TMP2]], i32 3, i32 [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svld1_hor_vnum_za32ju10__SVBool_tPKi( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CPP-CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP1]] +// CPP-CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1w.horiz( [[TMP4]], ptr [[TMP2]], i32 3, i32 [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svld1_hor_vnum_za32(uint32_t slice_base, svbool_t pg, const int32_t *base) { + svld1_hor_vnum_za32(3, slice_base, 3, pg, base, 3); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svld1_hor_vnum_za64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 3 +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP1]] +// CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1d.horiz( [[TMP4]], ptr [[TMP2]], i32 7, i32 [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: 
@_Z24test_svld1_hor_vnum_za64ju10__SVBool_tPKl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CPP-CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP1]] +// CPP-CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1d.horiz( [[TMP4]], ptr [[TMP2]], i32 7, i32 [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svld1_hor_vnum_za64(uint32_t slice_base, svbool_t pg, const int64_t *base) { + svld1_hor_vnum_za64(7, slice_base, 1, pg, base, 3); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svld1_hor_vnum_za128( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 3 +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP1]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1q.horiz( [[TMP3]], ptr [[TMP2]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svld1_hor_vnum_za128ju10__SVBool_tPKn( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CPP-CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP1]] +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1q.horiz( [[TMP3]], ptr [[TMP2]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svld1_hor_vnum_za128(uint32_t slice_base, svbool_t pg, const __int128_t *base) { + svld1_hor_vnum_za128(15, slice_base, 0, pg, base, 3); +} + +// == VERTICAL == + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svld1_ver_za8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1b.vert( [[PG:%.*]], ptr [[BASE:%.*]], i32 0, i32 [[TMP0]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z18test_svld1_ver_za8ju10__SVBool_tPKa( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1b.vert( [[PG:%.*]], ptr [[BASE:%.*]], i32 0, i32 [[TMP0]]) +// CPP-CHECK-NEXT: ret void +// +void test_svld1_ver_za8(uint32_t slice_base, svbool_t pg, const int8_t *base) { + svld1_ver_za8(0, slice_base, 15, pg, base); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svld1_ver_za16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1h.vert( [[TMP1]], ptr [[BASE:%.*]], i32 1, i32 [[TMP0]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z19test_svld1_ver_za16ju10__SVBool_tPKs( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1h.vert( 
[[TMP1]], ptr [[BASE:%.*]], i32 1, i32 [[TMP0]]) +// CPP-CHECK-NEXT: ret void +// +void test_svld1_ver_za16(uint32_t slice_base, svbool_t pg, const int16_t *base) { + svld1_ver_za16(1, slice_base, 7, pg, base); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svld1_ver_za32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1w.vert( [[TMP1]], ptr [[BASE:%.*]], i32 3, i32 [[TMP0]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z19test_svld1_ver_za32ju10__SVBool_tPKi( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1w.vert( [[TMP1]], ptr [[BASE:%.*]], i32 3, i32 [[TMP0]]) +// CPP-CHECK-NEXT: ret void +// +void test_svld1_ver_za32(uint32_t slice_base, svbool_t pg, const int32_t *base) { + svld1_ver_za32(3, slice_base, 3, pg, base); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svld1_ver_za64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1d.vert( [[TMP1]], ptr [[BASE:%.*]], i32 7, i32 [[TMP0]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z19test_svld1_ver_za64ju10__SVBool_tPKl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1d.vert( [[TMP1]], ptr [[BASE:%.*]], i32 7, i32 [[TMP0]]) +// CPP-CHECK-NEXT: ret void +// +void test_svld1_ver_za64(uint32_t slice_base, svbool_t pg, const int64_t *base) { + svld1_ver_za64(7, slice_base, 1, pg, base); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svld1_ver_za128( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1q.vert( [[TMP0]], ptr [[BASE:%.*]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svld1_ver_za128ju10__SVBool_tPKn( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1q.vert( [[TMP0]], ptr [[BASE:%.*]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svld1_ver_za128(uint32_t slice_base, svbool_t pg, const __int128_t *base) { + svld1_ver_za128(15, slice_base, 0, pg, base); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svld1_ver_vnum_za8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 3 +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP1]] +// CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1b.vert( [[PG:%.*]], ptr [[TMP2]], i32 0, i32 [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: 
@_Z23test_svld1_ver_vnum_za8ju10__SVBool_tPKa( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CPP-CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP1]] +// CPP-CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1b.vert( [[PG:%.*]], ptr [[TMP2]], i32 0, i32 [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svld1_ver_vnum_za8(uint32_t slice_base, svbool_t pg, const int8_t *base) { + svld1_ver_vnum_za8(0, slice_base, 15, pg, base, 3); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svld1_ver_vnum_za16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 3 +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP1]] +// CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1h.vert( [[TMP4]], ptr [[TMP2]], i32 1, i32 [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svld1_ver_vnum_za16ju10__SVBool_tPKs( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CPP-CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP1]] +// CPP-CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1h.vert( [[TMP4]], ptr [[TMP2]], i32 1, i32 [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svld1_ver_vnum_za16(uint32_t slice_base, svbool_t pg, const int16_t *base) { + svld1_ver_vnum_za16(1, slice_base, 7, pg, base, 3); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svld1_ver_vnum_za32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 3 +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP1]] +// CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1w.vert( [[TMP4]], ptr [[TMP2]], i32 3, i32 [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svld1_ver_vnum_za32ju10__SVBool_tPKi( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CPP-CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP1]] +// CPP-CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1w.vert( [[TMP4]], ptr [[TMP2]], i32 3, i32 [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svld1_ver_vnum_za32(uint32_t slice_base, svbool_t pg, const int32_t *base) { + svld1_ver_vnum_za32(3, slice_base, 3, pg, base, 3); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svld1_ver_vnum_za64( +// CHECK-NEXT: entry: +// CHECK-NEXT: 
[[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 3 +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP1]] +// CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1d.vert( [[TMP4]], ptr [[TMP2]], i32 7, i32 [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svld1_ver_vnum_za64ju10__SVBool_tPKl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CPP-CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP1]] +// CPP-CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1d.vert( [[TMP4]], ptr [[TMP2]], i32 7, i32 [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svld1_ver_vnum_za64(uint32_t slice_base, svbool_t pg, const int64_t *base) { + svld1_ver_vnum_za64(7, slice_base, 1, pg, base, 3); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svld1_ver_vnum_za128( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 3 +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP1]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1q.vert( [[TMP3]], ptr [[TMP2]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svld1_ver_vnum_za128ju10__SVBool_tPKn( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CPP-CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP1]] +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1q.vert( [[TMP3]], ptr [[TMP2]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svld1_ver_vnum_za128(uint32_t slice_base, svbool_t pg, const __int128_t *base) { + svld1_ver_vnum_za128(15, slice_base, 0, pg, base, 3); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svldr_vnum_za( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 15 +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP1]] +// CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[TMP3]], ptr [[TMP2]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z18test_svldr_vnum_zaju10__SVBool_tPKh( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CPP-CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 15 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP1]] +// CPP-CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[TMP3]], ptr [[TMP2]]) +// CPP-CHECK-NEXT: ret void +// +void test_svldr_vnum_za(uint32_t 
+  svldr_vnum_za(slice_base, 15, pg, base);
+}
diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mem_ops.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mem_ops.c
new file mode 100644
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mem_ops.c
@@ -0,0 +1,66 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: aarch64-registered-target
+// RUN: %clang_cc1 -D__ARM_FEATURE_SME=1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -D__ARM_FEATURE_SME=1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+
+#include <arm_sme.h>
+
+// CHECK-LABEL: @test_arm_sc_memcpy(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[CALL:%.*]] = tail call ptr @__arm_sc_memcpy(ptr noundef [[DEST:%.*]], ptr noundef [[SRC:%.*]], i64 noundef [[N:%.*]]) #[[ATTR2:[0-9]+]]
+// CHECK-NEXT:    ret ptr [[CALL]]
+//
+// CPP-CHECK-LABEL: @_Z18test_arm_sc_memcpyPvPKvm(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[CALL:%.*]] = tail call ptr @__arm_sc_memcpy(ptr noundef [[DEST:%.*]], ptr noundef [[SRC:%.*]], i64 noundef [[N:%.*]]) #[[ATTR2:[0-9]+]]
+// CPP-CHECK-NEXT:    ret ptr [[CALL]]
+//
+__attribute__((arm_streaming_compatible))
+void *test_arm_sc_memcpy(void *dest, const void *src, size_t n) {
+  return __arm_sc_memcpy(dest, src, n);
+}
+
+// CHECK-LABEL: @test_arm_sc_memmove(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[CALL:%.*]] = tail call ptr @__arm_sc_memmove(ptr noundef [[DEST:%.*]], ptr noundef [[SRC:%.*]], i64 noundef [[N:%.*]]) #[[ATTR2]]
+// CHECK-NEXT:    ret ptr [[CALL]]
+//
+// CPP-CHECK-LABEL: @_Z19test_arm_sc_memmovePvPKvm(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[CALL:%.*]] = tail call ptr @__arm_sc_memmove(ptr noundef [[DEST:%.*]], ptr noundef [[SRC:%.*]], i64 noundef [[N:%.*]]) #[[ATTR2]]
+// CPP-CHECK-NEXT:    ret ptr [[CALL]]
+//
+__attribute__((arm_streaming_compatible))
+void *test_arm_sc_memmove(void *dest, const void *src, size_t n) {
+  return __arm_sc_memmove(dest, src, n);
+}
+
+// CHECK-LABEL: @test_arm_sc_memset(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[CALL:%.*]] = tail call ptr @__arm_sc_memset(ptr noundef [[DEST:%.*]], i32 noundef [[C:%.*]], i64 noundef [[N:%.*]]) #[[ATTR2]]
+// CHECK-NEXT:    ret ptr [[CALL]]
+//
+// CPP-CHECK-LABEL: @_Z18test_arm_sc_memsetPvim(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[CALL:%.*]] = tail call ptr @__arm_sc_memset(ptr noundef [[DEST:%.*]], i32 noundef [[C:%.*]], i64 noundef [[N:%.*]]) #[[ATTR2]]
+// CPP-CHECK-NEXT:    ret ptr [[CALL]]
+//
+__attribute__((arm_streaming_compatible))
+void *test_arm_sc_memset(void *dest, int c, size_t n) {
+  return __arm_sc_memset(dest, c, n);
+}
+
+// CHECK-LABEL: @test_arm_sc_memchr(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[CALL:%.*]] = tail call ptr @__arm_sc_memchr(ptr noundef [[DEST:%.*]], i32 noundef [[C:%.*]], i64 noundef [[N:%.*]]) #[[ATTR2]]
+// CHECK-NEXT:    ret ptr [[CALL]]
+//
+// CPP-CHECK-LABEL: @_Z18test_arm_sc_memchrPvim(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[CALL:%.*]] = tail call ptr @__arm_sc_memchr(ptr noundef [[DEST:%.*]], i32 noundef [[C:%.*]], i64 noundef [[N:%.*]]) #[[ATTR2]]
+// CPP-CHECK-NEXT:    ret ptr [[CALL]]
+//
+__attribute__((arm_streaming_compatible))
+void *test_arm_sc_memchr(void *dest, int c, size_t n) {
+  return __arm_sc_memchr(dest, c, n);
+}
diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mop.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mop.c
new file mode 100644
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mop.c
@@ -0,0 +1,408 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: aarch64-registered-target
+// RUN: %clang_cc1 -D__ARM_FEATURE_SME=1 -D__ARM_FEATURE_SME_I64I64=1 -D__ARM_FEATURE_SME_F64F64=1 -S -O1 -emit-llvm \
+// RUN:   -triple aarch64-none-linux-gnu -target-feature +sme -Werror -o - %s | FileCheck %s
+// RUN: %clang_cc1 -D__ARM_FEATURE_SME=1 -D__ARM_FEATURE_SME_I64I64=1 -D__ARM_FEATURE_SME_F64F64=1 -S -O1 -emit-llvm \
+// RUN:   -triple aarch64-none-linux-gnu -target-feature +sme -Werror -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+
+#include <arm_sme.h>
+
+// == MOPA / MOPS ==
+
+//
+//
+__attribute__((arm_streaming, arm_shared_za))
+// CHECK-LABEL: @test_svmopa_za32_bf16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PN:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PM:%.*]])
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.mopa.wide.nxv8bf16(i32 3, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i1> [[TMP1]], <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z21test_svmopa_za32_bf16u10__SVBool_tu10__SVBool_tu14__SVBFloat16_tu14__SVBFloat16_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PN:%.*]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PM:%.*]])
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.mopa.wide.nxv8bf16(i32 3, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i1> [[TMP1]], <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svmopa_za32_bf16(svbool_t pn, svbool_t pm, svbfloat16_t zn, svbfloat16_t zm) {
+  svmopa_za32_bf16(3, pn, pm, zn, zm);
+}
+
+//
+//
+__attribute__((arm_streaming, arm_shared_za))
+// CHECK-LABEL: @test_svmopa_za32_f16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PN:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PM:%.*]])
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.mopa.wide.nxv8f16(i32 3, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i1> [[TMP1]], <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z20test_svmopa_za32_f16u10__SVBool_tu10__SVBool_tu13__SVFloat16_tu13__SVFloat16_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PN:%.*]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PM:%.*]])
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.mopa.wide.nxv8f16(i32 3, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i1> [[TMP1]], <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svmopa_za32_f16(svbool_t pn, svbool_t pm, svfloat16_t zn, svfloat16_t zm) {
+  svmopa_za32_f16(3, pn, pm, zn, zm);
+}
+
+//
+//
+__attribute__((arm_streaming, arm_shared_za))
+// CHECK-LABEL: @test_svmopa_za32_s8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.smopa.wide.nxv16i8(i32 3, <vscale x 16 x i1> [[PN:%.*]], <vscale x 16 x i1> [[PM:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z19test_svmopa_za32_s8u10__SVBool_tu10__SVBool_tu10__SVInt8_tu10__SVInt8_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.smopa.wide.nxv16i8(i32 3, <vscale x 16 x i1> [[PN:%.*]], <vscale x 16 x i1> [[PM:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svmopa_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svint8_t zm) {
+  svmopa_za32_s8(3, pn, pm, zn, zm);
+}
+
+//
+//
+__attribute__((arm_streaming, arm_shared_za))
+// CHECK-LABEL: @test_svmopa_za32_u8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.umopa.wide.nxv16i8(i32 3, <vscale x 16 x i1> [[PN:%.*]], <vscale x 16 x i1> [[PM:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z19test_svmopa_za32_u8u10__SVBool_tu10__SVBool_tu11__SVUint8_tu11__SVUint8_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.umopa.wide.nxv16i8(i32 3, <vscale x 16 x i1> [[PN:%.*]], <vscale x 16 x i1> [[PM:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svmopa_za32_u8(svbool_t pn, svbool_t pm, svuint8_t zn, svuint8_t zm) {
+  svmopa_za32_u8(3, pn, pm, zn, zm);
+}
+
+//
+//
+__attribute__((arm_streaming, arm_shared_za))
+// CHECK-LABEL: @test_svmopa_za64_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PN:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PM:%.*]])
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.smopa.wide.nxv8i16(i32 3, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i1> [[TMP1]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z20test_svmopa_za64_s16u10__SVBool_tu10__SVBool_tu11__SVInt16_tu11__SVInt16_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PN:%.*]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PM:%.*]])
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.smopa.wide.nxv8i16(i32 3, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i1> [[TMP1]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svmopa_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svint16_t zm) {
+  svmopa_za64_s16(3, pn, pm, zn, zm);
+}
+
+//
+//
+__attribute__((arm_streaming, arm_shared_za))
+// CHECK-LABEL: @test_svmopa_za64_u16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PN:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PM:%.*]])
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.umopa.wide.nxv8i16(i32 3, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i1> [[TMP1]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z20test_svmopa_za64_u16u10__SVBool_tu10__SVBool_tu12__SVUint16_tu12__SVUint16_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PN:%.*]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PM:%.*]])
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.umopa.wide.nxv8i16(i32 3, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i1> [[TMP1]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svmopa_za64_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svuint16_t zm) {
+  svmopa_za64_u16(3, pn, pm, zn, zm);
+}
+
+//
+//
+__attribute__((arm_streaming, arm_shared_za))
+// CHECK-LABEL: @test_svmops_za32_bf16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PN:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PM:%.*]])
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.mops.wide.nxv8bf16(i32 3, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i1> [[TMP1]], <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z21test_svmops_za32_bf16u10__SVBool_tu10__SVBool_tu14__SVBFloat16_tu14__SVBFloat16_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PN:%.*]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PM:%.*]])
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.mops.wide.nxv8bf16(i32 3, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i1> [[TMP1]], <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svmops_za32_bf16(svbool_t pn, svbool_t pm, svbfloat16_t zn, svbfloat16_t zm) {
+  svmops_za32_bf16(3, pn, pm, zn, zm);
+}
+
+//
+//
+__attribute__((arm_streaming, arm_shared_za))
+// CHECK-LABEL: @test_svmops_za32_f16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PN:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PM:%.*]])
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.mops.wide.nxv8f16(i32 3, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i1> [[TMP1]], <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z20test_svmops_za32_f16u10__SVBool_tu10__SVBool_tu13__SVFloat16_tu13__SVFloat16_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PN:%.*]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PM:%.*]])
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.mops.wide.nxv8f16(i32 3, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i1> [[TMP1]], <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svmops_za32_f16(svbool_t pn, svbool_t pm, svfloat16_t zn, svfloat16_t zm) {
+  svmops_za32_f16(3, pn, pm, zn, zm);
+}
+
+//
+//
+__attribute__((arm_streaming, arm_shared_za))
+// CHECK-LABEL: @test_svmops_za32_s8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.smops.wide.nxv16i8(i32 3, <vscale x 16 x i1> [[PN:%.*]], <vscale x 16 x i1> [[PM:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z19test_svmops_za32_s8u10__SVBool_tu10__SVBool_tu10__SVInt8_tu10__SVInt8_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.smops.wide.nxv16i8(i32 3, <vscale x 16 x i1> [[PN:%.*]], <vscale x 16 x i1> [[PM:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svmops_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svint8_t zm) {
+  svmops_za32_s8(3, pn, pm, zn, zm);
+}
+
+//
+//
+__attribute__((arm_streaming, arm_shared_za))
+// CHECK-LABEL: @test_svmops_za32_u8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.umops.wide.nxv16i8(i32 3, <vscale x 16 x i1> [[PN:%.*]], <vscale x 16 x i1> [[PM:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z19test_svmops_za32_u8u10__SVBool_tu10__SVBool_tu11__SVUint8_tu11__SVUint8_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.umops.wide.nxv16i8(i32 3, <vscale x 16 x i1> [[PN:%.*]], <vscale x 16 x i1> [[PM:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svmops_za32_u8(svbool_t pn, svbool_t pm, svuint8_t zn, svuint8_t zm) {
+  svmops_za32_u8(3, pn, pm, zn, zm);
+}
+
+//
+//
+__attribute__((arm_streaming, arm_shared_za))
+// CHECK-LABEL: @test_svmops_za64_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PN:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PM:%.*]])
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.smops.wide.nxv8i16(i32 3, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i1> [[TMP1]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z20test_svmops_za64_s16u10__SVBool_tu10__SVBool_tu11__SVInt16_tu11__SVInt16_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PN:%.*]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PM:%.*]])
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.smops.wide.nxv8i16(i32 3, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i1> [[TMP1]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svmops_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svint16_t zm) {
+  svmops_za64_s16(3, pn, pm, zn, zm);
+}
+
+//
+//
+__attribute__((arm_streaming, arm_shared_za))
+// CHECK-LABEL: @test_svmops_za64_u16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PN:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PM:%.*]])
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.umops.wide.nxv8i16(i32 3, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i1> [[TMP1]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z20test_svmops_za64_u16u10__SVBool_tu10__SVBool_tu12__SVUint16_tu12__SVUint16_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PN:%.*]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PM:%.*]])
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.umops.wide.nxv8i16(i32 3, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i1> [[TMP1]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svmops_za64_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svuint16_t zm) {
+  svmops_za64_u16(3, pn, pm, zn, zm);
+}
+
+//
+//
+__attribute__((arm_streaming, arm_shared_za))
+// CHECK-LABEL: @test_svmopa_za32_f32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PN:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PM:%.*]])
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.mopa.nxv4f32(i32 3, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i1> [[TMP1]], <vscale x 4 x float> [[ZN:%.*]], <vscale x 4 x float> [[ZM:%.*]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z20test_svmopa_za32_f32u10__SVBool_tu10__SVBool_tu13__SVFloat32_tu13__SVFloat32_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PN:%.*]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PM:%.*]])
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.mopa.nxv4f32(i32 3, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i1> [[TMP1]], <vscale x 4 x float> [[ZN:%.*]], <vscale x 4 x float> [[ZM:%.*]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svmopa_za32_f32(svbool_t pn, svbool_t pm, svfloat32_t zn, svfloat32_t zm) {
+  svmopa_za32_f32(3, pn, pm, zn, zm);
+}
+
+//
+//
+__attribute__((arm_streaming, arm_shared_za))
+// CHECK-LABEL: @test_svmopa_za64_f64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PN:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PM:%.*]])
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.mopa.nxv2f64(i32 7, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i1> [[TMP1]], <vscale x 2 x double> [[ZN:%.*]], <vscale x 2 x double> [[ZM:%.*]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z20test_svmopa_za64_f64u10__SVBool_tu10__SVBool_tu13__SVFloat64_tu13__SVFloat64_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PN:%.*]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PM:%.*]])
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.mopa.nxv2f64(i32 7, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i1> [[TMP1]], <vscale x 2 x double> [[ZN:%.*]], <vscale x 2 x double> [[ZM:%.*]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svmopa_za64_f64(svbool_t pn, svbool_t pm, svfloat64_t zn, svfloat64_t zm) {
+  svmopa_za64_f64(7, pn, pm, zn, zm);
+}
+
+//
+//
+__attribute__((arm_streaming, arm_shared_za))
+// CHECK-LABEL: @test_svmops_za32_f32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PN:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PM:%.*]])
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.mops.nxv4f32(i32 3, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i1> [[TMP1]], <vscale x 4 x float> [[ZN:%.*]], <vscale x 4 x float> [[ZM:%.*]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z20test_svmops_za32_f32u10__SVBool_tu10__SVBool_tu13__SVFloat32_tu13__SVFloat32_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PN:%.*]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PM:%.*]])
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.mops.nxv4f32(i32 3, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i1> [[TMP1]], <vscale x 4 x float> [[ZN:%.*]], <vscale x 4 x float> [[ZM:%.*]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svmops_za32_f32(svbool_t pn, svbool_t pm, svfloat32_t zn, svfloat32_t zm) {
+  svmops_za32_f32(3, pn, pm, zn, zm);
+}
+
+//
+//
+__attribute__((arm_streaming, arm_shared_za))
+// CHECK-LABEL: @test_svmops_za64_f64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PN:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PM:%.*]])
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.mops.nxv2f64(i32 7, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i1> [[TMP1]], <vscale x 2 x double> [[ZN:%.*]], <vscale x 2 x double> [[ZM:%.*]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z20test_svmops_za64_f64u10__SVBool_tu10__SVBool_tu13__SVFloat64_tu13__SVFloat64_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PN:%.*]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PM:%.*]])
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.mops.nxv2f64(i32 7, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i1> [[TMP1]], <vscale x 2 x double> [[ZN:%.*]], <vscale x 2 x double> [[ZM:%.*]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svmops_za64_f64(svbool_t pn, svbool_t pm, svfloat64_t zn, svfloat64_t zm) {
+  svmops_za64_f64(7, pn, pm, zn, zm);
+}
+
+// == MIXED SIGN MOPA / MOPS ==
+
+//
+//
+__attribute__((arm_streaming, arm_shared_za))
+// CHECK-LABEL: @test_svsumops_za32_s8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.sumops.wide.nxv16i8(i32 3, <vscale x 16 x i1> [[PN:%.*]], <vscale x 16 x i1> [[PM:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z21test_svsumops_za32_s8u10__SVBool_tu10__SVBool_tu10__SVInt8_tu10__SVInt8_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.sumops.wide.nxv16i8(i32 3, <vscale x 16 x i1> [[PN:%.*]], <vscale x 16 x i1> [[PM:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svsumops_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svint8_t zm) {
+  svsumops_za32_s8(3, pn, pm, zn, zm);
+}
+
+//
+//
+__attribute__((arm_streaming, arm_shared_za))
+// CHECK-LABEL: @test_svsumops_za64_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PN:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PM:%.*]])
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.sumops.wide.nxv8i16(i32 3, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i1> [[TMP1]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z22test_svsumops_za64_s16u10__SVBool_tu10__SVBool_tu11__SVInt16_tu11__SVInt16_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PN:%.*]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PM:%.*]])
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.sumops.wide.nxv8i16(i32 3, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i1> [[TMP1]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svsumops_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svint16_t zm) {
+  svsumops_za64_s16(3, pn, pm, zn, zm);
+}
+
+//
+//
+__attribute__((arm_streaming, arm_shared_za))
+// CHECK-LABEL: @test_svusmops_za32_u8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.usmops.wide.nxv16i8(i32 3, <vscale x 16 x i1> [[PN:%.*]], <vscale x 16 x i1> [[PM:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z21test_svusmops_za32_u8u10__SVBool_tu10__SVBool_tu11__SVUint8_tu11__SVUint8_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.usmops.wide.nxv16i8(i32 3, <vscale x 16 x i1> [[PN:%.*]], <vscale x 16 x i1> [[PM:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svusmops_za32_u8(svbool_t pn, svbool_t pm, svuint8_t zn, svuint8_t zm) {
+  svusmops_za32_u8(3, pn, pm, zn, zm);
+}
+
+//
+//
+__attribute__((arm_streaming, arm_shared_za))
+// CHECK-LABEL: @test_svusmops_za64_u16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PN:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PM:%.*]])
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.usmops.wide.nxv8i16(i32 3, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i1> [[TMP1]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z22test_svusmops_za64_u16u10__SVBool_tu10__SVBool_tu12__SVUint16_tu12__SVUint16_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PN:%.*]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PM:%.*]])
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.usmops.wide.nxv8i16(i32 3, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i1> [[TMP1]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svusmops_za64_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svuint16_t zm) {
+  svusmops_za64_u16(3, pn, pm, zn, zm);
+}
diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_reads.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_reads.c
new file mode 100644
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_reads.c
@@ -0,0 +1,954 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: aarch64-registered-target
+// RUN: %clang_cc1 -D__ARM_FEATURE_SME=1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -D__ARM_FEATURE_SME=1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+
+#include <arm_sme.h>
+
+// == HORIZONTAL ==
+
+//
+//
+__attribute__((arm_streaming, arm_shared_za))
+// CHECK-LABEL: @test_svread_hor_za8_s8_m(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 15
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sme.read.horiz.nxv16i8(<vscale x 16 x i8> [[ZD:%.*]], <vscale x 16 x i1> [[PG:%.*]], i32 0, i32 [[TMP0]])
+// CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z24test_svread_hor_za8_s8_mu10__SVInt8_tu10__SVBool_tj(
+// CPP-CHECK-NEXT:
entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 0, i32 [[TMP0]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svint8_t test_svread_hor_za8_s8_m(svint8_t zd, svbool_t pg, uint32_t slice_base) { + return svread_hor_za8_s8_m(zd, pg, 0, slice_base, 15); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za8_u8_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 0, i32 [[TMP0]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z24test_svread_hor_za8_u8_mu11__SVUint8_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 0, i32 [[TMP0]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svuint8_t test_svread_hor_za8_u8_m(svuint8_t zd, svbool_t pg, uint32_t slice_base) { + return svread_hor_za8_u8_m(zd, pg, 0, slice_base, 15); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za16_s16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 1, i32 [[TMP1]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z26test_svread_hor_za16_s16_mu11__SVInt16_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 1, i32 [[TMP1]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svint16_t test_svread_hor_za16_s16_m(svint16_t zd, svbool_t pg, uint32_t slice_base) { + return svread_hor_za16_s16_m(zd, pg, 1, slice_base, 7); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za16_u16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 1, i32 [[TMP1]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z26test_svread_hor_za16_u16_mu12__SVUint16_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 1, i32 [[TMP1]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svuint16_t test_svread_hor_za16_u16_m(svuint16_t zd, svbool_t pg, uint32_t slice_base) { + return svread_hor_za16_u16_m(zd, pg, 1, slice_base, 7); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za16_f16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 
[[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv8f16( [[ZD:%.*]], [[TMP0]], i32 1, i32 [[TMP1]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z26test_svread_hor_za16_f16_mu13__SVFloat16_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv8f16( [[ZD:%.*]], [[TMP0]], i32 1, i32 [[TMP1]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svfloat16_t test_svread_hor_za16_f16_m(svfloat16_t zd, svbool_t pg, uint32_t slice_base) { + return svread_hor_za16_f16_m(zd, pg, 1, slice_base, 7); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za16_bf16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv8bf16( [[ZD:%.*]], [[TMP0]], i32 1, i32 [[TMP1]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z27test_svread_hor_za16_bf16_mu14__SVBFloat16_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv8bf16( [[ZD:%.*]], [[TMP0]], i32 1, i32 [[TMP1]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svbfloat16_t test_svread_hor_za16_bf16_m(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) { + return svread_hor_za16_bf16_m(zd, pg, 1, slice_base, 7); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za32_s32_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 3, i32 [[TMP1]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z26test_svread_hor_za32_s32_mu11__SVInt32_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 3, i32 [[TMP1]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svint32_t test_svread_hor_za32_s32_m(svint32_t zd, svbool_t pg, uint32_t slice_base) { + return svread_hor_za32_s32_m(zd, pg, 3, slice_base, 3); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za32_u32_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 3, i32 [[TMP1]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z26test_svread_hor_za32_u32_mu12__SVUint32_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 
[[SLICE_BASE:%.*]], 3 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 3, i32 [[TMP1]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svuint32_t test_svread_hor_za32_u32_m(svuint32_t zd, svbool_t pg, uint32_t slice_base) { + return svread_hor_za32_u32_m(zd, pg, 3, slice_base, 3); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za32_f32_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv4f32( [[ZD:%.*]], [[TMP0]], i32 3, i32 [[TMP1]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z26test_svread_hor_za32_f32_mu13__SVFloat32_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv4f32( [[ZD:%.*]], [[TMP0]], i32 3, i32 [[TMP1]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svfloat32_t test_svread_hor_za32_f32_m(svfloat32_t zd, svbool_t pg, uint32_t slice_base) { + return svread_hor_za32_f32_m(zd, pg, 3, slice_base, 3); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za64_s64_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 7, i32 [[TMP1]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z26test_svread_hor_za64_s64_mu11__SVInt64_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 7, i32 [[TMP1]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svint64_t test_svread_hor_za64_s64_m(svint64_t zd, svbool_t pg, uint32_t slice_base) { + return svread_hor_za64_s64_m(zd, pg, 7, slice_base, 1); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za64_u64_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 7, i32 [[TMP1]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z26test_svread_hor_za64_u64_mu12__SVUint64_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 7, i32 [[TMP1]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svuint64_t test_svread_hor_za64_u64_m(svuint64_t zd, svbool_t pg, uint32_t slice_base) { + return svread_hor_za64_u64_m(zd, pg, 7, slice_base, 1); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za64_f64_m( +// CHECK-NEXT: 
entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv2f64( [[ZD:%.*]], [[TMP0]], i32 7, i32 [[TMP1]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z26test_svread_hor_za64_f64_mu13__SVFloat64_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv2f64( [[ZD:%.*]], [[TMP0]], i32 7, i32 [[TMP1]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svfloat64_t test_svread_hor_za64_f64_m(svfloat64_t zd, svbool_t pg, uint32_t slice_base) { + return svread_hor_za64_f64_m(zd, pg, 7, slice_base, 1); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za128_s8_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z26test_svread_hor_za128_s8_mu10__SVInt8_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svint8_t test_svread_hor_za128_s8_m(svint8_t zd, svbool_t pg, uint32_t slice_base) { + return svread_hor_za128_s8_m(zd, pg, 15, slice_base, 0); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za128_u8_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z26test_svread_hor_za128_u8_mu11__SVUint8_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svuint8_t test_svread_hor_za128_u8_m(svuint8_t zd, svbool_t pg, uint32_t slice_base) { + return svread_hor_za128_u8_m(zd, pg, 15, slice_base, 0); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za128_s16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z27test_svread_hor_za128_s16_mu11__SVInt16_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svint16_t test_svread_hor_za128_s16_m(svint16_t zd, svbool_t pg, uint32_t slice_base) { + return svread_hor_za128_s16_m(zd, pg, 15, slice_base, 0); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za128_u16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: 
[[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z27test_svread_hor_za128_u16_mu12__SVUint16_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svuint16_t test_svread_hor_za128_u16_m(svuint16_t zd, svbool_t pg, uint32_t slice_base) { + return svread_hor_za128_u16_m(zd, pg, 15, slice_base, 0); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za128_f16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv8f16( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z27test_svread_hor_za128_f16_mu13__SVFloat16_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv8f16( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svfloat16_t test_svread_hor_za128_f16_m(svfloat16_t zd, svbool_t pg, uint32_t slice_base) { + return svread_hor_za128_f16_m(zd, pg, 15, slice_base, 0); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za128_bf16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv8bf16( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_hor_za128_bf16_mu14__SVBFloat16_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv8bf16( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svbfloat16_t test_svread_hor_za128_bf16_m(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) { + return svread_hor_za128_bf16_m(zd, pg, 15, slice_base, 0); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za128_s32_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z27test_svread_hor_za128_s32_mu11__SVInt32_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svint32_t test_svread_hor_za128_s32_m(svint32_t zd, svbool_t pg, uint32_t slice_base) { + return svread_hor_za128_s32_m(zd, pg, 15, slice_base, 0); +} + +// +// +__attribute__((arm_streaming, 
arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za128_u32_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z27test_svread_hor_za128_u32_mu12__SVUint32_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svuint32_t test_svread_hor_za128_u32_m(svuint32_t zd, svbool_t pg, uint32_t slice_base) { + return svread_hor_za128_u32_m(zd, pg, 15, slice_base, 0); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za128_f32_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv4f32( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z27test_svread_hor_za128_f32_mu13__SVFloat32_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv4f32( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svfloat32_t test_svread_hor_za128_f32_m(svfloat32_t zd, svbool_t pg, uint32_t slice_base) { + return svread_hor_za128_f32_m(zd, pg, 15, slice_base, 0); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za128_s64_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z27test_svread_hor_za128_s64_mu11__SVInt64_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svint64_t test_svread_hor_za128_s64_m(svint64_t zd, svbool_t pg, uint32_t slice_base) { + return svread_hor_za128_s64_m(zd, pg, 15, slice_base, 0); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za128_u64_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z27test_svread_hor_za128_u64_mu12__SVUint64_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// 
+svuint64_t test_svread_hor_za128_u64_m(svuint64_t zd, svbool_t pg, uint32_t slice_base) { + return svread_hor_za128_u64_m(zd, pg, 15, slice_base, 0); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za128_f64_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv2f64( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z27test_svread_hor_za128_f64_mu13__SVFloat64_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv2f64( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svfloat64_t test_svread_hor_za128_f64_m(svfloat64_t zd, svbool_t pg, uint32_t slice_base) { + return svread_hor_za128_f64_m(zd, pg, 15, slice_base, 0); +} + +// == VERTICAL == + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za8_s8_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 0, i32 [[TMP0]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z24test_svread_ver_za8_s8_mu10__SVInt8_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 0, i32 [[TMP0]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svint8_t test_svread_ver_za8_s8_m(svint8_t zd, svbool_t pg, uint32_t slice_base) { + return svread_ver_za8_s8_m(zd, pg, 0, slice_base, 15); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za8_u8_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 0, i32 [[TMP0]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z24test_svread_ver_za8_u8_mu11__SVUint8_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 0, i32 [[TMP0]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svuint8_t test_svread_ver_za8_u8_m(svuint8_t zd, svbool_t pg, uint32_t slice_base) { + return svread_ver_za8_u8_m(zd, pg, 0, slice_base, 15); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za16_s16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 1, i32 [[TMP1]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z26test_svread_ver_za16_s16_mu11__SVInt16_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call 
@llvm.aarch64.sme.read.vert.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 1, i32 [[TMP1]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svint16_t test_svread_ver_za16_s16_m(svint16_t zd, svbool_t pg, uint32_t slice_base) { + return svread_ver_za16_s16_m(zd, pg, 1, slice_base, 7); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za16_u16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 1, i32 [[TMP1]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z26test_svread_ver_za16_u16_mu12__SVUint16_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 1, i32 [[TMP1]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svuint16_t test_svread_ver_za16_u16_m(svuint16_t zd, svbool_t pg, uint32_t slice_base) { + return svread_ver_za16_u16_m(zd, pg, 1, slice_base, 7); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za16_f16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv8f16( [[ZD:%.*]], [[TMP0]], i32 1, i32 [[TMP1]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z26test_svread_ver_za16_f16_mu13__SVFloat16_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv8f16( [[ZD:%.*]], [[TMP0]], i32 1, i32 [[TMP1]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svfloat16_t test_svread_ver_za16_f16_m(svfloat16_t zd, svbool_t pg, uint32_t slice_base) { + return svread_ver_za16_f16_m(zd, pg, 1, slice_base, 7); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za16_bf16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv8bf16( [[ZD:%.*]], [[TMP0]], i32 1, i32 [[TMP1]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z27test_svread_ver_za16_bf16_mu14__SVBFloat16_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv8bf16( [[ZD:%.*]], [[TMP0]], i32 1, i32 [[TMP1]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svbfloat16_t test_svread_ver_za16_bf16_m(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) { + return svread_ver_za16_bf16_m(zd, pg, 1, slice_base, 7); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za32_s32_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call 
@llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 3, i32 [[TMP1]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z26test_svread_ver_za32_s32_mu11__SVInt32_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 3, i32 [[TMP1]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svint32_t test_svread_ver_za32_s32_m(svint32_t zd, svbool_t pg, uint32_t slice_base) { + return svread_ver_za32_s32_m(zd, pg, 3, slice_base, 3); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za32_u32_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 3, i32 [[TMP1]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z26test_svread_ver_za32_u32_mu12__SVUint32_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 3, i32 [[TMP1]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svuint32_t test_svread_ver_za32_u32_m(svuint32_t zd, svbool_t pg, uint32_t slice_base) { + return svread_ver_za32_u32_m(zd, pg, 3, slice_base, 3); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za32_f32_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv4f32( [[ZD:%.*]], [[TMP0]], i32 3, i32 [[TMP1]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z26test_svread_ver_za32_f32_mu13__SVFloat32_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv4f32( [[ZD:%.*]], [[TMP0]], i32 3, i32 [[TMP1]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svfloat32_t test_svread_ver_za32_f32_m(svfloat32_t zd, svbool_t pg, uint32_t slice_base) { + return svread_ver_za32_f32_m(zd, pg, 3, slice_base, 3); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za64_s64_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 7, i32 [[TMP1]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z26test_svread_ver_za64_s64_mu11__SVInt64_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call 
@llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 7, i32 [[TMP1]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svint64_t test_svread_ver_za64_s64_m(svint64_t zd, svbool_t pg, uint32_t slice_base) { + return svread_ver_za64_s64_m(zd, pg, 7, slice_base, 1); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za64_u64_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 7, i32 [[TMP1]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z26test_svread_ver_za64_u64_mu12__SVUint64_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 7, i32 [[TMP1]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svuint64_t test_svread_ver_za64_u64_m(svuint64_t zd, svbool_t pg, uint32_t slice_base) { + return svread_ver_za64_u64_m(zd, pg, 7, slice_base, 1); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za64_f64_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv2f64( [[ZD:%.*]], [[TMP0]], i32 7, i32 [[TMP1]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z26test_svread_ver_za64_f64_mu13__SVFloat64_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv2f64( [[ZD:%.*]], [[TMP0]], i32 7, i32 [[TMP1]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svfloat64_t test_svread_ver_za64_f64_m(svfloat64_t zd, svbool_t pg, uint32_t slice_base) { + return svread_ver_za64_f64_m(zd, pg, 7, slice_base, 1); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za128_s8_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z26test_svread_ver_za128_s8_mu10__SVInt8_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svint8_t test_svread_ver_za128_s8_m(svint8_t zd, svbool_t pg, uint32_t slice_base) { + return svread_ver_za128_s8_m(zd, pg, 15, slice_base, 0); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za128_u8_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: 
@_Z26test_svread_ver_za128_u8_mu11__SVUint8_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svuint8_t test_svread_ver_za128_u8_m(svuint8_t zd, svbool_t pg, uint32_t slice_base) { + return svread_ver_za128_u8_m(zd, pg, 15, slice_base, 0); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za128_s16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z27test_svread_ver_za128_s16_mu11__SVInt16_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svint16_t test_svread_ver_za128_s16_m(svint16_t zd, svbool_t pg, uint32_t slice_base) { + return svread_ver_za128_s16_m(zd, pg, 15, slice_base, 0); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za128_u16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z27test_svread_ver_za128_u16_mu12__SVUint16_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svuint16_t test_svread_ver_za128_u16_m(svuint16_t zd, svbool_t pg, uint32_t slice_base) { + return svread_ver_za128_u16_m(zd, pg, 15, slice_base, 0); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za128_f16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv8f16( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z27test_svread_ver_za128_f16_mu13__SVFloat16_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv8f16( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svfloat16_t test_svread_ver_za128_f16_m(svfloat16_t zd, svbool_t pg, uint32_t slice_base) { + return svread_ver_za128_f16_m(zd, pg, 15, slice_base, 0); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za128_bf16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv8bf16( [[ZD:%.*]], [[TMP0]], i32 15, 
i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_ver_za128_bf16_mu14__SVBFloat16_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv8bf16( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svbfloat16_t test_svread_ver_za128_bf16_m(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) { + return svread_ver_za128_bf16_m(zd, pg, 15, slice_base, 0); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za128_s32_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z27test_svread_ver_za128_s32_mu11__SVInt32_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svint32_t test_svread_ver_za128_s32_m(svint32_t zd, svbool_t pg, uint32_t slice_base) { + return svread_ver_za128_s32_m(zd, pg, 15, slice_base, 0); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za128_u32_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z27test_svread_ver_za128_u32_mu12__SVUint32_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svuint32_t test_svread_ver_za128_u32_m(svuint32_t zd, svbool_t pg, uint32_t slice_base) { + return svread_ver_za128_u32_m(zd, pg, 15, slice_base, 0); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za128_f32_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv4f32( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z27test_svread_ver_za128_f32_mu13__SVFloat32_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv4f32( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svfloat32_t test_svread_ver_za128_f32_m(svfloat32_t zd, svbool_t pg, uint32_t slice_base) { + return svread_ver_za128_f32_m(zd, pg, 15, slice_base, 0); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za128_s64_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: 
[[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z27test_svread_ver_za128_s64_mu11__SVInt64_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svint64_t test_svread_ver_za128_s64_m(svint64_t zd, svbool_t pg, uint32_t slice_base) { + return svread_ver_za128_s64_m(zd, pg, 15, slice_base, 0); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za128_u64_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z27test_svread_ver_za128_u64_mu12__SVUint64_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svuint64_t test_svread_ver_za128_u64_m(svuint64_t zd, svbool_t pg, uint32_t slice_base) { + return svread_ver_za128_u64_m(zd, pg, 15, slice_base, 0); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za128_f64_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv2f64( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z27test_svread_ver_za128_f64_mu13__SVFloat64_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv2f64( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svfloat64_t test_svread_ver_za128_f64_m(svfloat64_t zd, svbool_t pg, uint32_t slice_base) { + return svread_ver_za128_f64_m(zd, pg, 15, slice_base, 0); +} diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_state_funs.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_state_funs.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_state_funs.c @@ -0,0 +1,79 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -D__ARM_FEATURE_SME=1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -D__ARM_FEATURE_SME=1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK + +#include <arm_sme_draft_spec_subject_to_change.h> + +// CHECK-LABEL: @test_in_streaming_mode_from_normal( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call aarch64_sme_preservemost_from_x2 { i64, i64 }
@__arm_sme_state() #[[ATTR4:[0-9]+]] +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { i64, i64 } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = and i64 [[TMP1]], 1 +// CHECK-NEXT: [[TMP3:%.*]] = icmp ne i64 [[TMP2]], 0 +// CHECK-NEXT: ret i1 [[TMP3]] +// +// CPP-CHECK-LABEL: @_Z34test_in_streaming_mode_from_normalv( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call aarch64_sme_preservemost_from_x2 { i64, i64 } @__arm_sme_state() #[[ATTR4:[0-9]+]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { i64, i64 } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = and i64 [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = icmp ne i64 [[TMP2]], 0 +// CPP-CHECK-NEXT: ret i1 [[TMP3]] +// +bool test_in_streaming_mode_from_normal(void) { + return __arm_in_streaming_mode(); +} + +// +// +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_in_streaming_mode_from_streaming( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call aarch64_sme_preservemost_from_x2 { i64, i64 } @__arm_sme_state() #[[ATTR4]] +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { i64, i64 } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = and i64 [[TMP1]], 1 +// CHECK-NEXT: [[TMP3:%.*]] = icmp ne i64 [[TMP2]], 0 +// CHECK-NEXT: ret i1 [[TMP3]] +// +// CPP-CHECK-LABEL: @_Z37test_in_streaming_mode_from_streamingv( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call aarch64_sme_preservemost_from_x2 { i64, i64 } @__arm_sme_state() #[[ATTR4]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { i64, i64 } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = and i64 [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = icmp ne i64 [[TMP2]], 0 +// CPP-CHECK-NEXT: ret i1 [[TMP3]] +// +bool test_in_streaming_mode_from_streaming(void) { + return __arm_in_streaming_mode(); +} + +// CHECK-LABEL: @test_disable_za( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @__arm_disable_za() #[[ATTR5:[0-9]+]] +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_disable_zav( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @__arm_disable_za() #[[ATTR5:[0-9]+]] +// CPP-CHECK-NEXT: ret void +// +void test_disable_za(void) { + __arm_disable_za(); +} + +// +// +__attribute__((arm_streaming_compatible, arm_shared_za)) +// CHECK-LABEL: @test_svundef_za( +// CHECK-NEXT: entry: +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svundef_zav( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: ret void +// +void test_svundef_za(void) { + svundef_za(); +} + diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_stores.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_stores.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_stores.c @@ -0,0 +1,499 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -D__ARM_FEATURE_SME=1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -D__ARM_FEATURE_SME=1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK + +#include <arm_sme_draft_spec_subject_to_change.h> + +// == HORIZONTAL == + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svst1_hor_za8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.st1b.horiz( [[PG:%.*]], ptr [[BASE:%.*]], i32 0, i32 [[TMP0]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL:
@_Z18test_svst1_hor_za8ju10__SVBool_tPa( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1b.horiz( [[PG:%.*]], ptr [[BASE:%.*]], i32 0, i32 [[TMP0]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_hor_za8(uint32_t slice_base, svbool_t pg, int8_t *base) { + svst1_hor_za8(0, slice_base, 15, pg, base); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svst1_hor_za16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.st1h.horiz( [[TMP1]], ptr [[BASE:%.*]], i32 1, i32 [[TMP0]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z19test_svst1_hor_za16ju10__SVBool_tPs( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1h.horiz( [[TMP1]], ptr [[BASE:%.*]], i32 1, i32 [[TMP0]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_hor_za16(uint32_t slice_base, svbool_t pg, int16_t *base) { + svst1_hor_za16(1, slice_base, 7, pg, base); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svst1_hor_za32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.st1w.horiz( [[TMP1]], ptr [[BASE:%.*]], i32 3, i32 [[TMP0]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z19test_svst1_hor_za32ju10__SVBool_tPi( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1w.horiz( [[TMP1]], ptr [[BASE:%.*]], i32 3, i32 [[TMP0]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_hor_za32(uint32_t slice_base, svbool_t pg, int32_t *base) { + svst1_hor_za32(3, slice_base, 3, pg, base); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svst1_hor_za64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.st1d.horiz( [[TMP1]], ptr [[BASE:%.*]], i32 7, i32 [[TMP0]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z19test_svst1_hor_za64ju10__SVBool_tPl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1d.horiz( [[TMP1]], ptr [[BASE:%.*]], i32 7, i32 [[TMP0]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_hor_za64(uint32_t slice_base, svbool_t pg, int64_t *base) { + svst1_hor_za64(7, slice_base, 1, pg, base); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svst1_hor_za128( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.st1q.horiz( [[TMP0]], ptr 
[[BASE:%.*]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svst1_hor_za128ju10__SVBool_tPn( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1q.horiz( [[TMP0]], ptr [[BASE:%.*]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_hor_za128(uint32_t slice_base, svbool_t pg, __int128_t *base) { + svst1_hor_za128(15, slice_base, 0, pg, base); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svst1_hor_vnum_za8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 3 +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP1]] +// CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.st1b.horiz( [[PG:%.*]], ptr [[TMP2]], i32 0, i32 [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_svst1_hor_vnum_za8ju10__SVBool_tPa( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CPP-CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP1]] +// CPP-CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1b.horiz( [[PG:%.*]], ptr [[TMP2]], i32 0, i32 [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_hor_vnum_za8(uint32_t slice_base, svbool_t pg, int8_t *base) { + svst1_hor_vnum_za8(0, slice_base, 15, pg, base, 3); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svst1_hor_vnum_za16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 3 +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP1]] +// CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.st1h.horiz( [[TMP4]], ptr [[TMP2]], i32 1, i32 [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svst1_hor_vnum_za16ju10__SVBool_tPs( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CPP-CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP1]] +// CPP-CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1h.horiz( [[TMP4]], ptr [[TMP2]], i32 1, i32 [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_hor_vnum_za16(uint32_t slice_base, svbool_t pg, int16_t *base) { + svst1_hor_vnum_za16(1, slice_base, 7, pg, base, 3); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svst1_hor_vnum_za32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 3 +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP1]] +// CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = tail call 
@llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.st1w.horiz( [[TMP4]], ptr [[TMP2]], i32 3, i32 [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svst1_hor_vnum_za32ju10__SVBool_tPi( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CPP-CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP1]] +// CPP-CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1w.horiz( [[TMP4]], ptr [[TMP2]], i32 3, i32 [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_hor_vnum_za32(uint32_t slice_base, svbool_t pg, int32_t *base) { + svst1_hor_vnum_za32(3, slice_base, 3, pg, base, 3); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svst1_hor_vnum_za64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 3 +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP1]] +// CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.st1d.horiz( [[TMP4]], ptr [[TMP2]], i32 7, i32 [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svst1_hor_vnum_za64ju10__SVBool_tPl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CPP-CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP1]] +// CPP-CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1d.horiz( [[TMP4]], ptr [[TMP2]], i32 7, i32 [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_hor_vnum_za64(uint32_t slice_base, svbool_t pg, int64_t *base) { + svst1_hor_vnum_za64(7, slice_base, 1, pg, base, 3); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svst1_hor_vnum_za128( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 3 +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP1]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.st1q.horiz( [[TMP3]], ptr [[TMP2]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svst1_hor_vnum_za128ju10__SVBool_tPn( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CPP-CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP1]] +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1q.horiz( [[TMP3]], ptr [[TMP2]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_hor_vnum_za128(uint32_t slice_base, svbool_t pg, __int128_t *base) 
{ + svst1_hor_vnum_za128(15, slice_base, 0, pg, base, 3); +} + +// == VERTICAL == + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svst1_ver_za8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.st1b.vert( [[PG:%.*]], ptr [[BASE:%.*]], i32 0, i32 [[TMP0]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z18test_svst1_ver_za8ju10__SVBool_tPa( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1b.vert( [[PG:%.*]], ptr [[BASE:%.*]], i32 0, i32 [[TMP0]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_ver_za8(uint32_t slice_base, svbool_t pg, int8_t *base) { + svst1_ver_za8(0, slice_base, 15, pg, base); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svst1_ver_za16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.st1h.vert( [[TMP1]], ptr [[BASE:%.*]], i32 1, i32 [[TMP0]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z19test_svst1_ver_za16ju10__SVBool_tPs( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1h.vert( [[TMP1]], ptr [[BASE:%.*]], i32 1, i32 [[TMP0]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_ver_za16(uint32_t slice_base, svbool_t pg, int16_t *base) { + svst1_ver_za16(1, slice_base, 7, pg, base); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svst1_ver_za32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.st1w.vert( [[TMP1]], ptr [[BASE:%.*]], i32 3, i32 [[TMP0]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z19test_svst1_ver_za32ju10__SVBool_tPi( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1w.vert( [[TMP1]], ptr [[BASE:%.*]], i32 3, i32 [[TMP0]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_ver_za32(uint32_t slice_base, svbool_t pg, int32_t *base) { + svst1_ver_za32(3, slice_base, 3, pg, base); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svst1_ver_za64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.st1d.vert( [[TMP1]], ptr [[BASE:%.*]], i32 7, i32 [[TMP0]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z19test_svst1_ver_za64ju10__SVBool_tPl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1d.vert( [[TMP1]], ptr [[BASE:%.*]], i32 7, i32 [[TMP0]]) +// CPP-CHECK-NEXT: ret void +// +void 
test_svst1_ver_za64(uint32_t slice_base, svbool_t pg, int64_t *base) { + svst1_ver_za64(7, slice_base, 1, pg, base); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svst1_ver_za128( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.st1q.vert( [[TMP0]], ptr [[BASE:%.*]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svst1_ver_za128ju10__SVBool_tPn( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1q.vert( [[TMP0]], ptr [[BASE:%.*]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_ver_za128(uint32_t slice_base, svbool_t pg, __int128_t *base) { + svst1_ver_za128(15, slice_base, 0, pg, base); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svst1_ver_vnum_za8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 3 +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP1]] +// CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.st1b.vert( [[PG:%.*]], ptr [[TMP2]], i32 0, i32 [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_svst1_ver_vnum_za8ju10__SVBool_tPa( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CPP-CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP1]] +// CPP-CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1b.vert( [[PG:%.*]], ptr [[TMP2]], i32 0, i32 [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_ver_vnum_za8(uint32_t slice_base, svbool_t pg, int8_t *base) { + svst1_ver_vnum_za8(0, slice_base, 15, pg, base, 3); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svst1_ver_vnum_za16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 3 +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP1]] +// CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.st1h.vert( [[TMP4]], ptr [[TMP2]], i32 1, i32 [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svst1_ver_vnum_za16ju10__SVBool_tPs( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CPP-CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP1]] +// CPP-CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1h.vert( [[TMP4]], ptr [[TMP2]], i32 1, i32 [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_ver_vnum_za16(uint32_t slice_base, svbool_t pg, int16_t *base) { + svst1_ver_vnum_za16(1, slice_base, 7, pg, base, 3); +} + +// +// 
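+// A minimal sketch of the addressing the vnum checks encode (assuming streaming mode, where svcntsb() is the streaming vector length in bytes): each vnum form stores at +// (char *)base + vnum * svcntsb(), which is why a call to @llvm.aarch64.sme.cntsb(), a multiply by the vnum immediate, and a byte-wise getelementptr precede each store.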
+__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svst1_ver_vnum_za32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 3 +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP1]] +// CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.st1w.vert( [[TMP4]], ptr [[TMP2]], i32 3, i32 [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svst1_ver_vnum_za32ju10__SVBool_tPi( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CPP-CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP1]] +// CPP-CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1w.vert( [[TMP4]], ptr [[TMP2]], i32 3, i32 [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_ver_vnum_za32(uint32_t slice_base, svbool_t pg, int32_t *base) { + svst1_ver_vnum_za32(3, slice_base, 3, pg, base, 3); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svst1_ver_vnum_za64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 3 +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP1]] +// CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.st1d.vert( [[TMP4]], ptr [[TMP2]], i32 7, i32 [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svst1_ver_vnum_za64ju10__SVBool_tPl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CPP-CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP1]] +// CPP-CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1d.vert( [[TMP4]], ptr [[TMP2]], i32 7, i32 [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_ver_vnum_za64(uint32_t slice_base, svbool_t pg, int64_t *base) { + svst1_ver_vnum_za64(7, slice_base, 1, pg, base, 3); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svst1_ver_vnum_za128( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 3 +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP1]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.st1q.vert( [[TMP3]], ptr [[TMP2]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svst1_ver_vnum_za128ju10__SVBool_tPn( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CPP-CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 3 +// 
CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP1]] +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1q.vert( [[TMP3]], ptr [[TMP2]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_ver_vnum_za128(uint32_t slice_base, svbool_t pg, __int128_t *base) { + svst1_ver_vnum_za128(15, slice_base, 0, pg, base, 3); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svstr_vnum_za( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 15 +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP1]] +// CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.str(i32 [[TMP3]], ptr [[TMP2]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z18test_svstr_vnum_zaju10__SVBool_tPh( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CPP-CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 15 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP1]] +// CPP-CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.str(i32 [[TMP3]], ptr [[TMP2]]) +// CPP-CHECK-NEXT: ret void +// +void test_svstr_vnum_za(uint32_t slice_base, svbool_t pg, uint8_t *base) { + svstr_vnum_za(slice_base, 15, pg, base); +} diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_writes.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_writes.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_writes.c @@ -0,0 +1,956 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -D__ARM_FEATURE_SME=1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -D__ARM_FEATURE_SME=1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK + +#include <arm_sme_draft_spec_subject_to_change.h> + +// == HORIZONTAL == + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za8_s8_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv16i8(i32 0, i32 [[TMP0]], [[PG:%.*]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svwrite_hor_za8_s8_mju10__SVBool_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv16i8(i32 0, i32 [[TMP0]], [[PG:%.*]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za8_s8_m(uint32_t slice_base, svbool_t pg, svint8_t zn) { + svwrite_hor_za8_s8_m(0, slice_base, 15, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za8_u8_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv16i8(i32 0, i32 [[TMP0]], [[PG:%.*]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svwrite_hor_za8_u8_mju10__SVBool_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +//
CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv16i8(i32 0, i32 [[TMP0]], [[PG:%.*]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za8_u8_m(uint32_t slice_base, svbool_t pg, svuint8_t zn) { + svwrite_hor_za8_u8_m(0, slice_base, 15, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za16_s16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv8i16(i32 1, i32 [[TMP1]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_hor_za16_s16_mju10__SVBool_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv8i16(i32 1, i32 [[TMP1]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za16_s16_m(uint32_t slice_base, svbool_t pg, svint16_t zn) { + svwrite_hor_za16_s16_m(1, slice_base, 7, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za16_u16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv8i16(i32 1, i32 [[TMP1]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_hor_za16_u16_mju10__SVBool_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv8i16(i32 1, i32 [[TMP1]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za16_u16_m(uint32_t slice_base, svbool_t pg, svuint16_t zn) { + svwrite_hor_za16_u16_m(1, slice_base, 7, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za16_f16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv8f16(i32 1, i32 [[TMP1]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_hor_za16_f16_mju10__SVBool_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv8f16(i32 1, i32 [[TMP1]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za16_f16_m(uint32_t slice_base, svbool_t pg, svfloat16_t zn) { + svwrite_hor_za16_f16_m(1, slice_base, 7, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za16_bf16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( 
[[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv8bf16(i32 1, i32 [[TMP1]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z28test_svwrite_hor_za16_bf16_mju10__SVBool_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv8bf16(i32 1, i32 [[TMP1]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za16_bf16_m(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) { + svwrite_hor_za16_bf16_m(1, slice_base, 7, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za32_s32_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv4i32(i32 3, i32 [[TMP1]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_hor_za32_s32_mju10__SVBool_tu11__SVInt32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv4i32(i32 3, i32 [[TMP1]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za32_s32_m(uint32_t slice_base, svbool_t pg, svint32_t zn) { + svwrite_hor_za32_s32_m(3, slice_base, 3, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za32_u32_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv4i32(i32 3, i32 [[TMP1]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_hor_za32_u32_mju10__SVBool_tu12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv4i32(i32 3, i32 [[TMP1]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za32_u32_m(uint32_t slice_base, svbool_t pg, svuint32_t zn) { + svwrite_hor_za32_u32_m(3, slice_base, 3, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za32_f32_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv4f32(i32 3, i32 [[TMP1]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_hor_za32_f32_mju10__SVBool_tu13__SVFloat32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CPP-CHECK-NEXT: tail call void 
@llvm.aarch64.sme.write.horiz.nxv4f32(i32 3, i32 [[TMP1]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za32_f32_m(uint32_t slice_base, svbool_t pg, svfloat32_t zn) { + svwrite_hor_za32_f32_m(3, slice_base, 3, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za64_s64_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv2i64(i32 7, i32 [[TMP1]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_hor_za64_s64_mju10__SVBool_tu11__SVInt64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv2i64(i32 7, i32 [[TMP1]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za64_s64_m(uint32_t slice_base, svbool_t pg, svint64_t zn) { + svwrite_hor_za64_s64_m(7, slice_base, 1, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za64_u64_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv2i64(i32 7, i32 [[TMP1]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_hor_za64_u64_mju10__SVBool_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv2i64(i32 7, i32 [[TMP1]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za64_u64_m(uint32_t slice_base, svbool_t pg, svuint64_t zn) { + svwrite_hor_za64_u64_m(7, slice_base, 1, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za64_f64_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv2f64(i32 7, i32 [[TMP1]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_hor_za64_f64_mju10__SVBool_tu13__SVFloat64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv2f64(i32 7, i32 [[TMP1]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za64_f64_m(uint32_t slice_base, svbool_t pg, svfloat64_t zn) { + svwrite_hor_za64_f64_m(7, slice_base, 1, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za128_s8_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv16i8(i32 15, i32 [[SLICE_BASE:%.*]], [[PG:%.*]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: 
@_Z27test_svwrite_hor_za128_s8_mju10__SVBool_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv16i8(i32 15, i32 [[SLICE_BASE:%.*]], [[PG:%.*]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za128_s8_m(uint32_t slice_base, svbool_t pg, svint8_t zn) { + svwrite_hor_za128_s8_m(15, slice_base, 0, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za128_u8_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv16i8(i32 15, i32 [[SLICE_BASE:%.*]], [[PG:%.*]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_hor_za128_u8_mju10__SVBool_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv16i8(i32 15, i32 [[SLICE_BASE:%.*]], [[PG:%.*]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za128_u8_m(uint32_t slice_base, svbool_t pg, svuint8_t zn) { + svwrite_hor_za128_u8_m(15, slice_base, 0, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za128_s16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv8i16(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z28test_svwrite_hor_za128_s16_mju10__SVBool_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv8i16(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za128_s16_m(uint32_t slice_base, svbool_t pg, svint16_t zn) { + svwrite_hor_za128_s16_m(15, slice_base, 0, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za128_u16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv8i16(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z28test_svwrite_hor_za128_u16_mju10__SVBool_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv8i16(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za128_u16_m(uint32_t slice_base, svbool_t pg, svuint16_t zn) { + svwrite_hor_za128_u16_m(15, slice_base, 0, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za128_f16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv8f16(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z28test_svwrite_hor_za128_f16_mju10__SVBool_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv8f16(i32 
15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za128_f16_m(uint32_t slice_base, svbool_t pg, svfloat16_t zn) { + svwrite_hor_za128_f16_m(15, slice_base, 0, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za128_bf16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv8bf16(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_hor_za128_bf16_mju10__SVBool_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv8bf16(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za128_bf16_m(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) { + svwrite_hor_za128_bf16_m(15, slice_base, 0, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za128_s32_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv4i32(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z28test_svwrite_hor_za128_s32_mju10__SVBool_tu11__SVInt32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv4i32(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za128_s32_m(uint32_t slice_base, svbool_t pg, svint32_t zn) { + svwrite_hor_za128_s32_m(15, slice_base, 0, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za128_u32_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv4i32(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z28test_svwrite_hor_za128_u32_mju10__SVBool_tu12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv4i32(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za128_u32_m(uint32_t slice_base, svbool_t pg, svuint32_t zn) { + svwrite_hor_za128_u32_m(15, slice_base, 0, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za128_f32_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv4f32(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z28test_svwrite_hor_za128_f32_mju10__SVBool_tu13__SVFloat32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: 
tail call void @llvm.aarch64.sme.writeq.horiz.nxv4f32(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za128_f32_m(uint32_t slice_base, svbool_t pg, svfloat32_t zn) { + svwrite_hor_za128_f32_m(15, slice_base, 0, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za128_s64_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv2i64(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z28test_svwrite_hor_za128_s64_mju10__SVBool_tu11__SVInt64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv2i64(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za128_s64_m(uint32_t slice_base, svbool_t pg, svint64_t zn) { + svwrite_hor_za128_s64_m(15, slice_base, 0, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za128_u64_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv2i64(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z28test_svwrite_hor_za128_u64_mju10__SVBool_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv2i64(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za128_u64_m(uint32_t slice_base, svbool_t pg, svuint64_t zn) { + svwrite_hor_za128_u64_m(15, slice_base, 0, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za128_f64_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv2f64(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z28test_svwrite_hor_za128_f64_mju10__SVBool_tu13__SVFloat64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv2f64(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za128_f64_m(uint32_t slice_base, svbool_t pg, svfloat64_t zn) { + svwrite_hor_za128_f64_m(15, slice_base, 0, pg, zn); +} + +// == VERTICAL == + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za8_s8_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv16i8(i32 0, i32 [[TMP0]], [[PG:%.*]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svwrite_ver_za8_s8_mju10__SVBool_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CPP-CHECK-NEXT: tail call void 
@llvm.aarch64.sme.write.vert.nxv16i8(i32 0, i32 [[TMP0]], [[PG:%.*]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za8_s8_m(uint32_t slice_base, svbool_t pg, svint8_t zn) { + svwrite_ver_za8_s8_m(0, slice_base, 15, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za8_u8_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv16i8(i32 0, i32 [[TMP0]], [[PG:%.*]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svwrite_ver_za8_u8_mju10__SVBool_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv16i8(i32 0, i32 [[TMP0]], [[PG:%.*]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za8_u8_m(uint32_t slice_base, svbool_t pg, svuint8_t zn) { + svwrite_ver_za8_u8_m(0, slice_base, 15, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za16_s16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv8i16(i32 1, i32 [[TMP1]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_ver_za16_s16_mju10__SVBool_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv8i16(i32 1, i32 [[TMP1]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za16_s16_m(uint32_t slice_base, svbool_t pg, svint16_t zn) { + svwrite_ver_za16_s16_m(1, slice_base, 7, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za16_u16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv8i16(i32 1, i32 [[TMP1]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_ver_za16_u16_mju10__SVBool_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv8i16(i32 1, i32 [[TMP1]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za16_u16_m(uint32_t slice_base, svbool_t pg, svuint16_t zn) { + svwrite_ver_za16_u16_m(1, slice_base, 7, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za16_f16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv8f16(i32 1, i32 [[TMP1]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_ver_za16_f16_mju10__SVBool_tu13__SVFloat16_t( +//
CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv8f16(i32 1, i32 [[TMP1]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za16_f16_m(uint32_t slice_base, svbool_t pg, svfloat16_t zn) { + svwrite_ver_za16_f16_m(1, slice_base, 7, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za16_bf16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv8bf16(i32 1, i32 [[TMP1]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z28test_svwrite_ver_za16_bf16_mju10__SVBool_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv8bf16(i32 1, i32 [[TMP1]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za16_bf16_m(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) { + svwrite_ver_za16_bf16_m(1, slice_base, 7, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za32_s32_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv4i32(i32 3, i32 [[TMP1]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_ver_za32_s32_mju10__SVBool_tu11__SVInt32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv4i32(i32 3, i32 [[TMP1]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za32_s32_m(uint32_t slice_base, svbool_t pg, svint32_t zn) { + svwrite_ver_za32_s32_m(3, slice_base, 3, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za32_u32_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv4i32(i32 3, i32 [[TMP1]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_ver_za32_u32_mju10__SVBool_tu12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv4i32(i32 3, i32 [[TMP1]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za32_u32_m(uint32_t slice_base, svbool_t pg, svuint32_t zn) { + svwrite_ver_za32_u32_m(3, slice_base, 3, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: 
@test_svwrite_ver_za32_f32_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv4f32(i32 3, i32 [[TMP1]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_ver_za32_f32_mju10__SVBool_tu13__SVFloat32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv4f32(i32 3, i32 [[TMP1]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za32_f32_m(uint32_t slice_base, svbool_t pg, svfloat32_t zn) { + svwrite_ver_za32_f32_m(3, slice_base, 3, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za64_s64_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv2i64(i32 7, i32 [[TMP1]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_ver_za64_s64_mju10__SVBool_tu11__SVInt64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv2i64(i32 7, i32 [[TMP1]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za64_s64_m(uint32_t slice_base, svbool_t pg, svint64_t zn) { + svwrite_ver_za64_s64_m(7, slice_base, 1, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za64_u64_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv2i64(i32 7, i32 [[TMP1]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_ver_za64_u64_mju10__SVBool_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv2i64(i32 7, i32 [[TMP1]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za64_u64_m(uint32_t slice_base, svbool_t pg, svuint64_t zn) { + svwrite_ver_za64_u64_m(7, slice_base, 1, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za64_f64_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv2f64(i32 7, i32 [[TMP1]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_ver_za64_f64_mju10__SVBool_tu13__SVFloat64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// 
CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv2f64(i32 7, i32 [[TMP1]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za64_f64_m(uint32_t slice_base, svbool_t pg, svfloat64_t zn) { + svwrite_ver_za64_f64_m(7, slice_base, 1, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za128_s8_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv16i8(i32 15, i32 [[SLICE_BASE:%.*]], [[PG:%.*]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_ver_za128_s8_mju10__SVBool_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv16i8(i32 15, i32 [[SLICE_BASE:%.*]], [[PG:%.*]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za128_s8_m(uint32_t slice_base, svbool_t pg, svint8_t zn) { + svwrite_ver_za128_s8_m(15, slice_base, 0, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za128_u8_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv16i8(i32 15, i32 [[SLICE_BASE:%.*]], [[PG:%.*]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_ver_za128_u8_mju10__SVBool_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv16i8(i32 15, i32 [[SLICE_BASE:%.*]], [[PG:%.*]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za128_u8_m(uint32_t slice_base, svbool_t pg, svuint8_t zn) { + svwrite_ver_za128_u8_m(15, slice_base, 0, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za128_s16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv8i16(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z28test_svwrite_ver_za128_s16_mju10__SVBool_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv8i16(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za128_s16_m(uint32_t slice_base, svbool_t pg, svint16_t zn) { + svwrite_ver_za128_s16_m(15, slice_base, 0, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za128_u16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv8i16(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z28test_svwrite_ver_za128_u16_mju10__SVBool_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv8i16(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za128_u16_m(uint32_t slice_base, svbool_t pg, svuint16_t zn) { + svwrite_ver_za128_u16_m(15, slice_base, 0, pg, zn); +} + +// +// 
+__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za128_f16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv8f16(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z28test_svwrite_ver_za128_f16_mju10__SVBool_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv8f16(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za128_f16_m(uint32_t slice_base, svbool_t pg, svfloat16_t zn) { + svwrite_ver_za128_f16_m(15, slice_base, 0, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za128_bf16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv8bf16(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_ver_za128_bf16_mju10__SVBool_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv8bf16(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za128_bf16_m(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) { + svwrite_ver_za128_bf16_m(15, slice_base, 0, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za128_s32_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv4i32(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z28test_svwrite_ver_za128_s32_mju10__SVBool_tu11__SVInt32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv4i32(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za128_s32_m(uint32_t slice_base, svbool_t pg, svint32_t zn) { + svwrite_ver_za128_s32_m(15, slice_base, 0, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za128_u32_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv4i32(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z28test_svwrite_ver_za128_u32_mju10__SVBool_tu12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv4i32(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za128_u32_m(uint32_t slice_base, svbool_t pg, svuint32_t zn) { + 
svwrite_ver_za128_u32_m(15, slice_base, 0, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za128_f32_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv4f32(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z28test_svwrite_ver_za128_f32_mju10__SVBool_tu13__SVFloat32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv4f32(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za128_f32_m(uint32_t slice_base, svbool_t pg, svfloat32_t zn) { + svwrite_ver_za128_f32_m(15, slice_base, 0, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za128_s64_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv2i64(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z28test_svwrite_ver_za128_s64_mju10__SVBool_tu11__SVInt64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv2i64(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za128_s64_m(uint32_t slice_base, svbool_t pg, svint64_t zn) { + svwrite_ver_za128_s64_m(15, slice_base, 0, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za128_u64_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv2i64(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z28test_svwrite_ver_za128_u64_mju10__SVBool_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv2i64(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za128_u64_m(uint32_t slice_base, svbool_t pg, svuint64_t zn) { + svwrite_ver_za128_u64_m(15, slice_base, 0, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za128_f64_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv2f64(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z28test_svwrite_ver_za128_f64_mju10__SVBool_tu13__SVFloat64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv2f64(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za128_f64_m(uint32_t 
slice_base, svbool_t pg, svfloat64_t zn) { + svwrite_ver_za128_f64_m(15, slice_base, 0, pg, zn); +} diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_zero.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_zero.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_zero.c @@ -0,0 +1,40 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -D__ARM_FEATURE_SME=1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -D__ARM_FEATURE_SME=1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK + +#include + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svzero_mask_za_3( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.zero(i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_svzero_mask_za_3v( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.zero(i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svzero_mask_za_3(void) { + svzero_mask_za(3); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svzero_za( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.zero(i32 255) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z14test_svzero_zav( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.zero(i32 255) +// CPP-CHECK-NEXT: ret void +// +void test_svzero_za(void) { + svzero_za(); +} diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/negative/acle_sme_add.c b/clang/test/CodeGen/aarch64-sme-intrinsics/negative/acle_sme_add.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/negative/acle_sme_add.c @@ -0,0 +1,52 @@ +// RUN: %clang_cc1 -D__ARM_FEATURE_SME=1 -D__ARM_FEATURE_SME_I64I64=1 -triple aarch64-none-linux-gnu \ +// RUN: -target-feature +sme -fsyntax-only -verify %s + +#include + +__attribute__((arm_streaming, arm_shared_za)) +void test_svaddha_za32_s32(svbool_t pn, svbool_t pm, svint32_t zn) { + // expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} + svaddha_za32_s32(4, pn, pm, zn); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svaddha_za32_u32(svbool_t pn, svbool_t pm, svuint32_t zn) { + // expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} + svaddha_za32_u32(4, pn, pm, zn); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svaddha_za64_s64(svbool_t pn, svbool_t pm, svint64_t zn) { + // expected-error@+1 {{argument value 8 is outside the valid range [0, 7]}} + svaddha_za64_s64(8, pn, pm, zn); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svaddha_za64_u64(svbool_t pn, svbool_t pm, svuint64_t zn) { + // expected-error@+1 {{argument value 8 is outside the valid range [0, 7]}} + svaddha_za64_u64(8, pn, pm, zn); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svaddva_za32_s32(svbool_t pn, svbool_t pm, svint32_t zn) { + // expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} + svaddva_za32_s32(4, pn, pm, zn); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svaddva_za32_u32(svbool_t pn, svbool_t pm, svuint32_t zn) { + // expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} + svaddva_za32_u32(4, pn, pm, zn); +} + +__attribute__((arm_streaming, 
arm_shared_za)) +void test_svaddva_za64_s64(svbool_t pn, svbool_t pm, svint64_t zn) { + // expected-error@+1 {{argument value 8 is outside the valid range [0, 7]}} + svaddva_za64_s64(8, pn, pm, zn); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svaddva_za64_u64(svbool_t pn, svbool_t pm, svuint64_t zn) { + // expected-error@+1 {{argument value 8 is outside the valid range [0, 7]}} + svaddva_za64_u64(8, pn, pm, zn); +} diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/negative/acle_sme_loads.c b/clang/test/CodeGen/aarch64-sme-intrinsics/negative/acle_sme_loads.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/negative/acle_sme_loads.c @@ -0,0 +1,69 @@ +// RUN: %clang_cc1 -D__ARM_FEATURE_SME=1 -triple aarch64-none-linux-gnu -target-feature +sme -fsyntax-only -verify %s + +#include + +__attribute__((arm_streaming, arm_shared_za)) +void test_svld1_hor_za8_bad_tile(uint32_t slice_base, svbool_t pg, const int8_t *base) { + // expected-error@+1 {{argument value 1 is outside the valid range [0, 0]}} + svld1_hor_za8(1, slice_base, 0, pg, base); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svld1_hor_za16_bad_tile(uint32_t slice_base, svbool_t pg, const int8_t *base) { + // expected-error@+1 {{argument value 2 is outside the valid range [0, 1]}} + svld1_hor_za16(2, slice_base, 0, pg, base); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svld1_hor_za32_bad_tile(uint32_t slice_base, svbool_t pg, const int8_t *base) { + // expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} + svld1_hor_za32(4, slice_base, 0, pg, base); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svld1_ver_za64_bad_tile(uint32_t slice_base, svbool_t pg, const int8_t *base) { + // expected-error@+1 {{argument value 8 is outside the valid range [0, 7]}} + svld1_ver_za64(8, slice_base, 0, pg, base); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svld1_ver_za128_bad_tile(uint32_t slice_base, svbool_t pg, const int8_t *base) { + // expected-error@+1 {{argument value 16 is outside the valid range [0, 15]}} + svld1_ver_za128(16, slice_base, 0, pg, base); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svld1_hor_za8_slice_offset(uint32_t slice_base, svbool_t pg, const int8_t *base) { + // expected-error@+1 {{argument value 16 is outside the valid range [0, 15]}} + svld1_hor_za8(0, slice_base, 16, pg, base); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svld1_ver_za16_slice_offset(uint32_t slice_base, svbool_t pg, const int8_t *base) { + // expected-error@+1 {{argument value 8 is outside the valid range [0, 7]}} + svld1_ver_za16(0, slice_base, 8, pg, base); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svld1_hor_za32_slice_offset(uint32_t slice_base, svbool_t pg, const int8_t *base) { + // expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} + svld1_hor_za32(0, slice_base, 4, pg, base); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svld1_ver_za64_slice_offset(uint32_t slice_base, svbool_t pg, const int8_t *base) { + // expected-error@+1 {{argument value 2 is outside the valid range [0, 1]}} + svld1_ver_za64(0, slice_base, 2, pg, base); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svld1_ver_za128_slice_offset(uint32_t slice_base, svbool_t pg, const int8_t *base) { + // expected-error@+1 {{argument value 16 is outside the valid range [0, 0]}} + svld1_ver_za128(0, slice_base, 16, pg, base); +} + 
+__attribute__((arm_streaming, arm_shared_za)) +void test_svldr_vnum_za(uint32_t slice_base, svbool_t pg, const uint8_t *base) { + // expected-error@+1 {{argument value 16 is outside the valid range [0, 15]}} + svldr_vnum_za(slice_base, 16, pg, base); +} diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/negative/acle_sme_mop.c b/clang/test/CodeGen/aarch64-sme-intrinsics/negative/acle_sme_mop.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/negative/acle_sme_mop.c @@ -0,0 +1,128 @@ +// RUN: %clang_cc1 -D__ARM_FEATURE_SME=1 -D__ARM_FEATURE_SME_I64I64=1 -D__ARM_FEATURE_SME_F64F64=1 \ +// RUN: -triple aarch64-none-linux-gnu -target-feature +sme -fsyntax-only -verify %s + +#include + +// == MOPS / MOPA == + +__attribute__((arm_streaming, arm_shared_za)) +void test_svmopa_za32_bf16(svbool_t pn, svbool_t pm, svbfloat16_t zn, svbfloat16_t zm) { + // expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} + svmopa_za32_bf16(4, pn, pm, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svmopa_za32_f16(svbool_t pn, svbool_t pm, svfloat16_t zn, svfloat16_t zm) { + // expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} + svmopa_za32_f16(4, pn, pm, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svmopa_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svint8_t zm) { + // expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} + svmopa_za32_s8(4, pn, pm, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svmopa_za32_u8(svbool_t pn, svbool_t pm, svuint8_t zn, svuint8_t zm) { + // expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} + svmopa_za32_u8(4, pn, pm, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svmopa_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svint16_t zm) { + // expected-error@+1 {{argument value 8 is outside the valid range [0, 7]}} + svmopa_za64_s16(8, pn, pm, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svmopa_za64_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svuint16_t zm) { + // expected-error@+1 {{argument value 8 is outside the valid range [0, 7]}} + svmopa_za64_u16(8, pn, pm, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svmopa_za32_f32(svbool_t pn, svbool_t pm, svfloat32_t zn, svfloat32_t zm) { + // expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} + svmopa_za32_f32(4, pn, pm, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svmopa_za64_f64(svbool_t pn, svbool_t pm, svfloat64_t zn, svfloat64_t zm) { + // expected-error@+1 {{argument value 8 is outside the valid range [0, 7]}} + svmopa_za64_f64(8, pn, pm, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svmops_za32_bf16(svbool_t pn, svbool_t pm, svbfloat16_t zn, svbfloat16_t zm) { + // expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} + svmops_za32_bf16(4, pn, pm, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svmops_za32_f16(svbool_t pn, svbool_t pm, svfloat16_t zn, svfloat16_t zm) { + // expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} + svmops_za32_f16(4, pn, pm, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svmops_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svint8_t zm) { + // expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} + svmops_za32_s8(4, pn, pm, zn, zm); +} + 
+__attribute__((arm_streaming, arm_shared_za)) +void test_svmops_za32_u8(svbool_t pn, svbool_t pm, svuint8_t zn, svuint8_t zm) { + // expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} + svmops_za32_u8(4, pn, pm, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svmops_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svint16_t zm) { + // expected-error@+1 {{argument value 8 is outside the valid range [0, 7]}} + svmops_za64_s16(8, pn, pm, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svmops_za64_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svuint16_t zm) { + // expected-error@+1 {{argument value 8 is outside the valid range [0, 7]}} + svmops_za64_u16(8, pn, pm, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svmops_za32_f32(svbool_t pn, svbool_t pm, svfloat32_t zn, svfloat32_t zm) { + // expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} + svmops_za32_f32(4, pn, pm, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svmops_za64_f64(svbool_t pn, svbool_t pm, svfloat64_t zn, svfloat64_t zm) { + // expected-error@+1 {{argument value 8 is outside the valid range [0, 7]}} + svmops_za64_f64(8, pn, pm, zn, zm); +} + +// == MIXED SIGN MOPA / MOPS == + +__attribute__((arm_streaming, arm_shared_za)) +void test_svsumops_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svint8_t zm) { + // expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} + svsumops_za32_s8(4, pn, pm, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svsumops_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svint16_t zm) { + // expected-error@+1 {{argument value 8 is outside the valid range [0, 7]}} + svsumops_za64_s16(8, pn, pm, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svusmops_za32_u8(svbool_t pn, svbool_t pm, svuint8_t zn, svuint8_t zm) { + // expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} + svusmops_za32_u8(4, pn, pm, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svusmops_za64_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svuint16_t zm) { + // expected-error@+1 {{argument value 8 is outside the valid range [0, 7]}} + svusmops_za64_u16(8, pn, pm, zn, zm); +} diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/negative/acle_sme_reads.c b/clang/test/CodeGen/aarch64-sme-intrinsics/negative/acle_sme_reads.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/negative/acle_sme_reads.c @@ -0,0 +1,63 @@ +// RUN: %clang_cc1 -D__ARM_FEATURE_SME=1 -triple aarch64-none-linux-gnu -target-feature +sme -fsyntax-only -verify %s + +#include + +__attribute__((arm_streaming, arm_shared_za)) +void test_svread_hor_za8_bad_tile(svint8_t zd, svbool_t pg, uint32_t slice_base) { + // expected-error@+1 {{argument value 1 is outside the valid range [0, 0]}} + svread_hor_za8_s8_m(zd, pg, 1, slice_base, 0); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svread_ver_za16_bad_tile(svuint16_t zd, svbool_t pg, uint32_t slice_base) { + // expected-error@+1 {{argument value 2 is outside the valid range [0, 1]}} + svread_ver_za16_u16_m(zd, pg, 2, slice_base, 0); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svread_ver_za32_bad_tile(svfloat32_t zd, svbool_t pg, uint32_t slice_base) { + // expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} + svread_ver_za32_f32_m(zd, pg, 4, slice_base, 0); +} + +__attribute__((arm_streaming, 
arm_shared_za)) +void test_svread_hor_za64_bad_tile(svint64_t zd, svbool_t pg, uint32_t slice_base) { + // expected-error@+1 {{argument value 8 is outside the valid range [0, 7]}} + svread_hor_za64_s64_m(zd, pg, 8, slice_base, 0); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svread_hor_za128_bad_tile(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) { + // expected-error@+1 {{argument value 16 is outside the valid range [0, 15]}} + svread_hor_za128_bf16_m(zd, pg, 16, slice_base, 0); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svread_hor_za8_bad_slice(svint8_t zd, svbool_t pg, uint32_t slice_base) { + // expected-error@+1 {{argument value 16 is outside the valid range [0, 15]}} + svread_hor_za8_s8_m(zd, pg, 0, slice_base, 16); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svread_ver_za16_bad_slice(svuint16_t zd, svbool_t pg, uint32_t slice_base) { + // expected-error@+1 {{argument value 8 is outside the valid range [0, 7]}} + svread_ver_za16_u16_m(zd, pg, 1, slice_base, 8); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svread_ver_za32_bad_slice(svfloat32_t zd, svbool_t pg, uint32_t slice_base) { + // expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} + svread_ver_za32_f32_m(zd, pg, 3, slice_base, 4); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svread_hor_za64_bad_slice(svint64_t zd, svbool_t pg, uint32_t slice_base) { + // expected-error@+1 {{argument value 2 is outside the valid range [0, 1]}} + svread_hor_za64_s64_m(zd, pg, 7, slice_base, 2); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svread_hor_za128_bad_slice(svint32_t zd, svbool_t pg, uint32_t slice_base) { + // expected-error@+1 {{argument value 1 is outside the valid range [0, 0]}} + svread_hor_za128_s32_m(zd, pg, 15, slice_base, 1); +} diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/negative/acle_sme_stores.c b/clang/test/CodeGen/aarch64-sme-intrinsics/negative/acle_sme_stores.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/negative/acle_sme_stores.c @@ -0,0 +1,69 @@ +// RUN: %clang_cc1 -D__ARM_FEATURE_SME=1 -triple aarch64-none-linux-gnu -target-feature +sme -fsyntax-only -verify %s + +#include + +__attribute__((arm_streaming, arm_shared_za)) +void test_svst1_hor_za8_bad_tile(uint32_t slice_base, svbool_t pg, int8_t *base) { + // expected-error@+1 {{argument value 1 is outside the valid range [0, 0]}} + svst1_hor_za8(1, slice_base, 0, pg, base); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svst1_hor_za16_bad_tile(uint32_t slice_base, svbool_t pg, int8_t *base) { + // expected-error@+1 {{argument value 2 is outside the valid range [0, 1]}} + svst1_hor_za16(2, slice_base, 0, pg, base); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svst1_hor_za32_bad_tile(uint32_t slice_base, svbool_t pg, int8_t *base) { + // expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} + svst1_hor_za32(4, slice_base, 0, pg, base); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svst1_ver_za64_bad_tile(uint32_t slice_base, svbool_t pg, int8_t *base) { + // expected-error@+1 {{argument value 8 is outside the valid range [0, 7]}} + svst1_ver_za64(8, slice_base, 0, pg, base); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svst1_ver_za128_bad_tile(uint32_t slice_base, svbool_t pg, int8_t *base) { + // expected-error@+1 {{argument value 16 is outside the valid range [0, 15]}} + svst1_ver_za128(16, 
slice_base, 0, pg, base); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svst1_hor_za8_slice_offset(uint32_t slice_base, svbool_t pg, int8_t *base) { + // expected-error@+1 {{argument value 16 is outside the valid range [0, 15]}} + svst1_hor_za8(0, slice_base, 16, pg, base); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svst1_ver_za16_slice_offset(uint32_t slice_base, svbool_t pg, int8_t *base) { + // expected-error@+1 {{argument value 8 is outside the valid range [0, 7]}} + svst1_ver_za16(0, slice_base, 8, pg, base); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svst1_hor_za32_slice_offset(uint32_t slice_base, svbool_t pg, int8_t *base) { + // expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} + svst1_hor_za32(0, slice_base, 4, pg, base); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svst1_ver_za64_slice_offset(uint32_t slice_base, svbool_t pg, int8_t *base) { + // expected-error@+1 {{argument value 2 is outside the valid range [0, 1]}} + svst1_ver_za64(0, slice_base, 2, pg, base); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svst1_ver_za128_slice_offset(uint32_t slice_base, svbool_t pg, int8_t *base) { + // expected-error@+1 {{argument value 1 is outside the valid range [0, 0]}} + svst1_ver_za128(0, slice_base, 1, pg, base); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svstr_vnum_za(uint32_t slice_base, svbool_t pg, uint8_t *base) { + // expected-error@+1 {{argument value 16 is outside the valid range [0, 15]}} + svstr_vnum_za(slice_base, 16, pg, base); +} diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/negative/acle_sme_writes.c b/clang/test/CodeGen/aarch64-sme-intrinsics/negative/acle_sme_writes.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/negative/acle_sme_writes.c @@ -0,0 +1,63 @@ +// RUN: %clang_cc1 -D__ARM_FEATURE_SME=1 -triple aarch64-none-linux-gnu -target-feature +sme -fsyntax-only -verify %s + +#include + +__attribute__((arm_streaming, arm_shared_za)) +void test_svwrite_hor_za8_bad_tile(uint32_t slice_base, svbool_t pg, svint8_t zn) { + // expected-error@+1 {{argument value 1 is outside the valid range [0, 0]}} + svwrite_hor_za8_s8_m(1, slice_base, 0, pg, zn); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svwrite_ver_za16_bad_tile(uint32_t slice_base, svbool_t pg, svuint16_t zn) { + // expected-error@+1 {{argument value 2 is outside the valid range [0, 1]}} + svwrite_ver_za16_u16_m(2, slice_base, 0, pg, zn); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svwrite_ver_za32_bad_tile(uint32_t slice_base, svbool_t pg, svfloat32_t zn) { + // expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} + svwrite_ver_za32_f32_m(4, slice_base, 0, pg, zn); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svwrite_hor_za64_bad_tile(uint32_t slice_base, svbool_t pg, svint64_t zn) { + // expected-error@+1 {{argument value 8 is outside the valid range [0, 7]}} + svwrite_hor_za64_s64_m(8, slice_base, 0, pg, zn); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svwrite_hor_za128_bad_tile(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) { + // expected-error@+1 {{argument value 16 is outside the valid range [0, 15]}} + svwrite_hor_za128_bf16_m(16, slice_base, 0, pg, zn); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svwrite_hor_za8_bad_slice(uint32_t slice_base, svbool_t pg, svint8_t zn) { + // expected-error@+1 {{argument value 16 is 
outside the valid range [0, 15]}} + svwrite_hor_za8_s8_m(0, slice_base, 16, pg, zn); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svwrite_ver_za16_bad_slice(uint32_t slice_base, svbool_t pg, svuint16_t zn) { + // expected-error@+1 {{argument value 8 is outside the valid range [0, 7]}} + svwrite_ver_za16_u16_m(1, slice_base, 8, pg, zn); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svwrite_ver_za32_bad_slice(uint32_t slice_base, svbool_t pg, svfloat32_t zn) { + // expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} + svwrite_ver_za32_f32_m(3, slice_base, 4, pg, zn); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svwrite_hor_za64_bad_slice(uint32_t slice_base, svbool_t pg, svint64_t zn) { + // expected-error@+1 {{argument value 2 is outside the valid range [0, 1]}} + svwrite_hor_za64_s64_m(7, slice_base, 2, pg, zn); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svwrite_hor_za128_bad_slice(svint32_t zd, svbool_t pg, uint32_t slice_base) { + // expected-error@+1 {{argument value 1 is outside the valid range [0, 0]}} + svwrite_hor_za128_s32_m(15, slice_base, 1, pg, zd); +} diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/negative/acle_sme_zero.c b/clang/test/CodeGen/aarch64-sme-intrinsics/negative/acle_sme_zero.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/negative/acle_sme_zero.c @@ -0,0 +1,9 @@ +// RUN: %clang_cc1 -D__ARM_FEATURE_SME=1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme -fsyntax-only -verify %s + +#include + +__attribute__((arm_streaming, arm_shared_za)) +void test_svzero_mask_za(void) { + // expected-error@+1 {{argument value 256 is outside the valid range [0, 255]}} + svzero_mask_za(256); +} diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mla.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mla.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mla.c @@ -0,0 +1,348 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// RUN: %clang_cc1 -D__ARM_FEATURE_SME -D__ARM_FEATURE_SME2 -D__ARM_FEATURE_SME_F64F64 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -mem2reg -instcombine -tailcallelim | FileCheck %s +// RUN: %clang_cc1 -D__ARM_FEATURE_SME -D__ARM_FEATURE_SME2 -D__ARM_FEATURE_SME_F64F64 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -mem2reg -instcombine -tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -D__ARM_FEATURE_SME -D__ARM_FEATURE_SME2 -D__ARM_FEATURE_SME_F64F64 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -mem2reg -instcombine -tailcallelim | FileCheck %s +// RUN: %clang_cc1 -D__ARM_FEATURE_SME -D__ARM_FEATURE_SME2 -D__ARM_FEATURE_SME_F64F64 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -mem2reg -instcombine -tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -D__ARM_FEATURE_SME -D__ARM_FEATURE_SME2 -D__ARM_FEATURE_SME_F64F64 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -target-feature +sme-f64f64 -S -disable-O0-optnone 
-Werror -Wall -o /dev/null %s +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED,A5) A1##A3##A5 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4,A5) A1##A2##A3##A4##A5 +#endif + +// +// Multi, multi +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla_multi2_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.multi.vg1x2.nxv4f32(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_svmla_multi2_f32j13svfloat32x2_t13svfloat32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.multi.vg1x2.nxv4f32(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_multi2_f32(uint32_t slice_base, svfloat32x2_t zn, svfloat32x2_t zm) { + SVE_ACLE_FUNC(svmla,,_za32,_f32,_vg1x2)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla_multi4_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 12) +// CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.multi.vg1x4.nxv4f32(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_svmla_multi4_f32j13svfloat32x4_t13svfloat32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32(
[[ZN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 12) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.multi.vg1x4.nxv4f32(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_multi4_f32(uint32_t slice_base, svfloat32x4_t zn, svfloat32x4_t zm) { + SVE_ACLE_FUNC(svmla,,_za32,_f32,_vg1x4)(slice_base, 7, zn, zm); +} + +// +// Multi, single +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla_single2_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv4f32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.single.vg1x2.nxv4f32(i32 [[TMP3]], [[TMP0]], [[TMP1]], [[TMP2]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmla_single2_f32j13svfloat32x2_tu13__SVFloat32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv4f32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.single.vg1x2.nxv4f32(i32 [[TMP3]], [[TMP0]], [[TMP1]], [[TMP2]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_single2_f32(uint32_t slice_base, svfloat32x2_t zn, svfloat32_t zm) { + SVE_ACLE_FUNC(svmla,_single,_za32,_f32,_vg1x2)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla_single4_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv4f32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.single.vg1x4.nxv4f32(i32 [[TMP5]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmla_single4_f32j13svfloat32x4_tu13__SVFloat32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail 
call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv4f32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.single.vg1x4.nxv4f32(i32 [[TMP5]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_single4_f32(uint32_t slice_base, svfloat32x4_t zn, svfloat32_t zm) { + SVE_ACLE_FUNC(svmla,_single,_za32,_f32,_vg1x4)(slice_base, 7, zn, zm); +} + +// +// Multi, indexed +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla_lane2_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv4f32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.lane.vg1x2.nxv4f32(i32 [[TMP3]], [[TMP0]], [[TMP1]], [[TMP2]], i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmla_lane2_f32j13svfloat32x2_tu13__SVFloat32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv4f32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.lane.vg1x2.nxv4f32(i32 [[TMP3]], [[TMP0]], [[TMP1]], [[TMP2]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_lane2_f32(uint32_t slice_base, svfloat32x2_t zn, svfloat32_t zm) { + SVE_ACLE_FUNC(svmla_lane,,_za32,_f32,_vg1x2)(slice_base, 7, zn, zm, 3); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla_lane4_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv4f32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.lane.vg1x4.nxv4f32(i32 [[TMP5]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmla_lane4_f32j13svfloat32x4_tu13__SVFloat32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv4f32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void 
@llvm.aarch64.sme.fmla.lane.vg1x4.nxv4f32(i32 [[TMP5]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_lane4_f32(uint32_t slice_base, svfloat32x4_t zn, svfloat32_t zm) { + SVE_ACLE_FUNC(svmla_lane,,_za32,_f32,_vg1x4)(slice_base, 7, zn, zm, 3); +} + +// +// Multi, multi +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla_multi2_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZM]], i64 2) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.multi.vg1x2.nxv2f64(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_svmla_multi2_f64j13svfloat64x2_t13svfloat64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZM]], i64 2) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.multi.vg1x2.nxv2f64(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_multi2_f64(uint32_t slice_base, svfloat64x2_t zn, svfloat64x2_t zm) { + SVE_ACLE_FUNC(svmla,,_za64,_f64,_vg1x2)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla_multi4_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 2) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 6) +// CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.multi.vg1x4.nxv2f64(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_svmla_multi4_f64j13svfloat64x4_t13svfloat64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call
@llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 2) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 6) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.multi.vg1x4.nxv2f64(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_multi4_f64(uint32_t slice_base, svfloat64x4_t zn, svfloat64x4_t zm) { + SVE_ACLE_FUNC(svmla,,_za64,_f64,_vg1x4)(slice_base, 7, zn, zm); +} + +// +// Multi, single +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla_single2_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv2f64( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.single.vg1x2.nxv2f64(i32 [[TMP3]], [[TMP0]], [[TMP1]], [[TMP2]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmla_single2_f64j13svfloat64x2_tu13__SVFloat64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv2f64( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.single.vg1x2.nxv2f64(i32 [[TMP3]], [[TMP0]], [[TMP1]], [[TMP2]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_single2_f64(uint32_t slice_base, svfloat64x2_t zn, svfloat64_t zm) { + SVE_ACLE_FUNC(svmla,_single,_za64,_f64,_vg1x2)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla_single4_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv2f64( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.single.vg1x4.nxv2f64(i32 [[TMP5]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmla_single4_f64j13svfloat64x4_tu13__SVFloat64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 4) +// 
CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv2f64( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.single.vg1x4.nxv2f64(i32 [[TMP5]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_single4_f64(uint32_t slice_base, svfloat64x4_t zn, svfloat64_t zm) { + SVE_ACLE_FUNC(svmla,_single,_za64,_f64,_vg1x4)(slice_base, 7, zn, zm); +} + +// +// Multi, indexed +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla_lane2_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv2f64( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.lane.vg1x2.nxv2f64(i32 [[TMP3]], [[TMP0]], [[TMP1]], [[TMP2]], i32 1) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmla_lane2_f64j13svfloat64x2_tu13__SVFloat64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv2f64( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.lane.vg1x2.nxv2f64(i32 [[TMP3]], [[TMP0]], [[TMP1]], [[TMP2]], i32 1) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_lane2_f64(uint32_t slice_base, svfloat64x2_t zn, svfloat64_t zm) { + SVE_ACLE_FUNC(svmla_lane,,_za64,_f64,_vg1x2)(slice_base, 7, zn, zm, 1); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla_lane4_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv2f64( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.lane.vg1x4.nxv2f64(i32 [[TMP5]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], i32 1) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmla_lane4_f64j13svfloat64x4_tu13__SVFloat64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv2f64( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: 
tail call void @llvm.aarch64.sme.fmla.lane.vg1x4.nxv2f64(i32 [[TMP5]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], i32 1)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla_lane4_f64(uint32_t slice_base, svfloat64x4_t zn, svfloat64_t zm) {
+  SVE_ACLE_FUNC(svmla_lane,,_za64,_f64,_vg1x4)(slice_base, 7, zn, zm, 1);
+}
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlal.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlal.c
new file mode 100644
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlal.c
@@ -0,0 +1,856 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang_cc1 -D__ARM_FEATURE_SME -D__ARM_FEATURE_SME2 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -mem2reg -instcombine -tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -D__ARM_FEATURE_SME -D__ARM_FEATURE_SME2 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -mem2reg -instcombine -tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -D__ARM_FEATURE_SME -D__ARM_FEATURE_SME2 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -mem2reg -instcombine -tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -D__ARM_FEATURE_SME -D__ARM_FEATURE_SME2 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -mem2reg -instcombine -tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -D__ARM_FEATURE_SME -D__ARM_FEATURE_SME2 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+#include <arm_sme_draft_spec_subject_to_change.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
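+// For example, SVE_ACLE_FUNC(svmlal,_single,_za32,_f16,_vg2x2) pastes only the
+// used tokens into the overloaded name svmlal_za32_vg2x2; without
+// SVE_OVERLOADED_FORMS the variant below keeps every token and yields the
+// fully-suffixed name svmlal_single_za32_f16_vg2x2.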
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED,A5) A1##A3##A5 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4,A5) A1##A2##A3##A4##A5 +#endif + +// +// Multi, multi +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmlal_multi2_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.multi.vg2x2.nxv8f16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmlal_multi2_f16j13svfloat16x2_t13svfloat16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.multi.vg2x2.nxv8f16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmlal_multi2_f16(uint32_t slice_base, svfloat16x2_t zn, svfloat16x2_t zm) +{ + SVE_ACLE_FUNC(svmlal_za32,_f16,_vg2x2,,)(slice_base, 6, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmlal_multi2_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.multi.vg2x2.nxv8bf16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_svmlal_multi2_bf16j14svbfloat16x2_t14svbfloat16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.multi.vg2x2.nxv8bf16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmlal_multi2_bf16(uint32_t slice_base, svbfloat16x2_t zn, svbfloat16x2_t zm) +{ + SVE_ACLE_FUNC(svmlal_za32,_bf16,_vg2x2,,)(slice_base, 6, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmlal_multi2_u16( +// CHECK-NEXT: entry: +// 
CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.multi.vg2x2.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmlal_multi2_u16j12svuint16x2_t12svuint16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.multi.vg2x2.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmlal_multi2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16x2_t zm) +{ + SVE_ACLE_FUNC(svmlal_za32,_u16,_vg2x2,,)(slice_base, 6, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmlal_multi2_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.multi.vg2x2.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmlal_multi2_s16j11svint16x2_t11svint16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.multi.vg2x2.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmlal_multi2_s16(uint32_t slice_base, svint16x2_t zn, svint16x2_t zm) +{ + SVE_ACLE_FUNC(svmlal_za32,_s16,_vg2x2,,)(slice_base, 6, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmlal_multi4_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = 
tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 24) +// CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.multi.vg2x4.nxv8f16(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmlal_multi4_f16j13svfloat16x4_t13svfloat16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 24) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.multi.vg2x4.nxv8f16(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmlal_multi4_f16(uint32_t slice_base, svfloat16x4_t zn, svfloat16x4_t zm) +{ + SVE_ACLE_FUNC(svmlal_za32,_f16,_vg2x4,,)(slice_base, 6, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmlal_multi4_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZM]], i64 24) +// CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.multi.vg2x4.nxv8bf16(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_svmlal_multi4_bf16j14svbfloat16x4_t14svbfloat16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: 
[[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZM]], i64 24) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.multi.vg2x4.nxv8bf16(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmlal_multi4_bf16(uint32_t slice_base, svbfloat16x4_t zn, svbfloat16x4_t zm) +{ + SVE_ACLE_FUNC(svmlal_za32,_bf16,_vg2x4,,)(slice_base, 6, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmlal_multi4_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 24) +// CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.multi.vg2x4.nxv8i16(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmlal_multi4_u16j12svuint16x4_t12svuint16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 24) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.multi.vg2x4.nxv8i16(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmlal_multi4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16x4_t zm) +{ + SVE_ACLE_FUNC(svmlal_za32,_u16,_vg2x4,,)(slice_base, 6, zn, zm); +} + +__attribute__((arm_streaming, 
arm_shared_za)) +// CHECK-LABEL: @test_svmlal_multi4_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 24) +// CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.multi.vg2x4.nxv8i16(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmlal_multi4_s16j11svint16x4_t11svint16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 24) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.multi.vg2x4.nxv8i16(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmlal_multi4_s16(uint32_t slice_base, svint16x4_t zn, svint16x4_t zm) +{ + SVE_ACLE_FUNC(svmlal_za32,_s16,_vg2x4,,)(slice_base, 6, zn, zm); +} + +// +// Multi, single +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmlal_single1_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv8f16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv8f16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 14 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.single.vg2x1.nxv8f16(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_svmlal_single1_f16ju13__SVFloat16_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv8f16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv8f16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 14 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.single.vg2x1.nxv8f16(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmlal_single1_f16(uint32_t 
slice_base, svfloat16_t zn, svfloat16_t zm) +{ + SVE_ACLE_FUNC(svmlal,_single,_za32,_f16,_vg2x1)(slice_base, 14, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmlal_single1_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv8bf16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv8bf16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 14 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.single.vg2x1.nxv8bf16(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svmlal_single1_bf16ju14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv8bf16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv8bf16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 14 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.single.vg2x1.nxv8bf16(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmlal_single1_bf16(uint32_t slice_base, svbfloat16_t zn, svbfloat16_t zm) +{ + SVE_ACLE_FUNC(svmlal,_single,_za32,_bf16,_vg2x1)(slice_base, 14, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmlal_single1_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv8i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv8i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 14 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.single.vg2x1.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_svmlal_single1_u16ju12__SVUint16_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv8i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv8i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 14 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.single.vg2x1.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmlal_single1_u16(uint32_t slice_base, svuint16_t zn, svuint16_t zm) +{ + SVE_ACLE_FUNC(svmlal,_single,_za32,_u16,_vg2x1)(slice_base, 14, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmlal_single1_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv8i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv8i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 14 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.single.vg2x1.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_svmlal_single1_s16ju11__SVInt16_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv8i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv8i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 14 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.single.vg2x1.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// 
CPP-CHECK-NEXT: ret void +// +void test_svmlal_single1_s16(uint32_t slice_base, svint16_t zn, svint16_t zm) +{ + SVE_ACLE_FUNC(svmlal,_single,_za32,_s16,_vg2x1)(slice_base, 14, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmlal_single2_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv8f16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.single.vg2x2.nxv8f16(i32 [[TMP3]], [[TMP0]], [[TMP1]], [[TMP2]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_svmlal_single2_f16j13svfloat16x2_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv8f16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.single.vg2x2.nxv8f16(i32 [[TMP3]], [[TMP0]], [[TMP1]], [[TMP2]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmlal_single2_f16(uint32_t slice_base, svfloat16x2_t zn, svfloat16_t zm) +{ + SVE_ACLE_FUNC(svmlal,_single,_za32,_f16,_vg2x2)(slice_base, 6, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmlal_single2_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv8bf16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.single.vg2x2.nxv8bf16(i32 [[TMP3]], [[TMP0]], [[TMP1]], [[TMP2]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svmlal_single2_bf16j14svbfloat16x2_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv8bf16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.single.vg2x2.nxv8bf16(i32 [[TMP3]], [[TMP0]], [[TMP1]], [[TMP2]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmlal_single2_bf16(uint32_t slice_base, svbfloat16x2_t zn, svbfloat16_t zm) +{ + SVE_ACLE_FUNC(svmlal,_single,_za32,_bf16,_vg2x2)(slice_base, 6, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmlal_single2_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv8i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void 
@llvm.aarch64.sme.umlal.single.vg2x2.nxv8i16(i32 [[TMP3]], [[TMP0]], [[TMP1]], [[TMP2]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_svmlal_single2_u16j12svuint16x2_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv8i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.single.vg2x2.nxv8i16(i32 [[TMP3]], [[TMP0]], [[TMP1]], [[TMP2]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmlal_single2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) +{ + SVE_ACLE_FUNC(svmlal,_single,_za32,_u16,_vg2x2)(slice_base, 6, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmlal_single2_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv8i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.single.vg2x2.nxv8i16(i32 [[TMP3]], [[TMP0]], [[TMP1]], [[TMP2]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_svmlal_single2_s16j11svint16x2_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv8i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.single.vg2x2.nxv8i16(i32 [[TMP3]], [[TMP0]], [[TMP1]], [[TMP2]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmlal_single2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) +{ + SVE_ACLE_FUNC(svmlal,_single,_za32,_s16,_vg2x2)(slice_base, 6, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmlal_single4_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv8f16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.single.vg2x4.nxv8f16(i32 [[TMP5]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_svmlal_single4_f16j13svfloat16x4_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( 
[[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv8f16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.single.vg2x4.nxv8f16(i32 [[TMP5]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmlal_single4_f16(uint32_t slice_base, svfloat16x4_t zn, svfloat16_t zm) +{ + SVE_ACLE_FUNC(svmlal,_single,_za32,_f16,_vg2x4)(slice_base, 6, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmlal_single4_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv8bf16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.single.vg2x4.nxv8bf16(i32 [[TMP5]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svmlal_single4_bf16j14svbfloat16x4_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv8bf16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.single.vg2x4.nxv8bf16(i32 [[TMP5]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmlal_single4_bf16(uint32_t slice_base, svbfloat16x4_t zn, svbfloat16_t zm) +{ + SVE_ACLE_FUNC(svmlal,_single,_za32,_bf16,_vg2x4)(slice_base, 6, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmlal_single4_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv8i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.single.vg2x4.nxv8i16(i32 [[TMP5]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_svmlal_single4_u16j12svuint16x4_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// 
CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv8i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.single.vg2x4.nxv8i16(i32 [[TMP5]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmlal_single4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) +{ + SVE_ACLE_FUNC(svmlal,_single,_za32,_u16,_vg2x4)(slice_base, 6, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmlal_single4_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv8i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.single.vg2x4.nxv8i16(i32 [[TMP5]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_svmlal_single4_s16j11svint16x4_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv8i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.single.vg2x4.nxv8i16(i32 [[TMP5]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmlal_single4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) +{ + SVE_ACLE_FUNC(svmlal,_single,_za32,_s16,_vg2x4)(slice_base, 6, zn, zm); +} + +// +// Multi, indexed +// + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmlal_lane1_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv8f16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv8f16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 14 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.lane.vg2x1.nxv8f16(i32 [[TMP2]], [[TMP0]], [[TMP1]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_svmlal_lane1_f16ju13__SVFloat16_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv8f16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv8f16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 
[[SLICE_BASE:%.*]], 14 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.lane.vg2x1.nxv8f16(i32 [[TMP2]], [[TMP0]], [[TMP1]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_svmlal_lane1_f16(uint32_t slice_base, svfloat16_t zn, svfloat16_t zm) +{ + SVE_ACLE_FUNC(svmlal,_lane,_za32,_f16,_vg2x1)(slice_base, 14, zn, zm, 7); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmlal_lane1_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv8bf16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv8bf16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 14 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.lane.vg2x1.nxv8bf16(i32 [[TMP2]], [[TMP0]], [[TMP1]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmlal_lane1_bf16ju14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv8bf16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv8bf16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 14 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.lane.vg2x1.nxv8bf16(i32 [[TMP2]], [[TMP0]], [[TMP1]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_svmlal_lane1_bf16(uint32_t slice_base, svbfloat16_t zn, svbfloat16_t zm) +{ + SVE_ACLE_FUNC(svmlal,_lane,_za32,_bf16,_vg2x1)(slice_base, 14, zn, zm, 7); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmlal_lane1_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv8i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv8i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 14 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.lane.vg2x1.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_svmlal_lane1_u16ju12__SVUint16_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv8i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv8i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 14 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.lane.vg2x1.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_svmlal_lane1_u16(uint32_t slice_base, svuint16_t zn, svuint16_t zm) +{ + SVE_ACLE_FUNC(svmlal,_lane,_za32,_u16,_vg2x1)(slice_base, 14, zn, zm, 7); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmlal_lane1_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv8i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv8i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 14 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.lane.vg2x1.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_svmlal_lane1_s16ju11__SVInt16_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv8i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call 
@llvm.vector.extract.nxv8i16.nxv8i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 14 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.lane.vg2x1.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_svmlal_lane1_s16(uint32_t slice_base, svint16_t zn, svint16_t zm) +{ + SVE_ACLE_FUNC(svmlal,_lane,_za32,_s16,_vg2x1)(slice_base, 14, zn, zm, 7); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmlal_lane2_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv8f16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.lane.vg2x2.nxv8f16(i32 [[TMP3]], [[TMP0]], [[TMP1]], [[TMP2]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_svmlal_lane2_f16j13svfloat16x2_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv8f16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.lane.vg2x2.nxv8f16(i32 [[TMP3]], [[TMP0]], [[TMP1]], [[TMP2]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_svmlal_lane2_f16(uint32_t slice_base, svfloat16x2_t zn, svfloat16_t zm) +{ + SVE_ACLE_FUNC(svmlal,_lane,_za32,_f16,_vg2x2)(slice_base, 6, zn, zm, 7); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmlal_lane2_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv8bf16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.lane.vg2x2.nxv8bf16(i32 [[TMP3]], [[TMP0]], [[TMP1]], [[TMP2]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmlal_lane2_bf16j14svbfloat16x2_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv8bf16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.lane.vg2x2.nxv8bf16(i32 [[TMP3]], [[TMP0]], [[TMP1]], [[TMP2]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_svmlal_lane2_bf16(uint32_t slice_base, svbfloat16x2_t zn, svbfloat16_t zm) +{ + SVE_ACLE_FUNC(svmlal,_lane,_za32,_bf16,_vg2x2)(slice_base, 6, zn, zm, 7); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmlal_lane2_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call 
@llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv8i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.lane.vg2x2.nxv8i16(i32 [[TMP3]], [[TMP0]], [[TMP1]], [[TMP2]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_svmlal_lane2_u16j12svuint16x2_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv8i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.lane.vg2x2.nxv8i16(i32 [[TMP3]], [[TMP0]], [[TMP1]], [[TMP2]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_svmlal_lane2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) +{ + SVE_ACLE_FUNC(svmlal,_lane,_za32,_u16,_vg2x2)(slice_base, 6, zn, zm, 7); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmlal_lane2_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv8i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.lane.vg2x2.nxv8i16(i32 [[TMP3]], [[TMP0]], [[TMP1]], [[TMP2]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_svmlal_lane2_s16j11svint16x2_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv8i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.lane.vg2x2.nxv8i16(i32 [[TMP3]], [[TMP0]], [[TMP1]], [[TMP2]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_svmlal_lane2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) +{ + SVE_ACLE_FUNC(svmlal,_lane,_za32,_s16,_vg2x2)(slice_base, 6, zn, zm, 7); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmlal_lane4_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv8f16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.lane.vg2x4.nxv8f16(i32 [[TMP5]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_svmlal_lane4_f16j13svfloat16x4_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] 
= tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv8f16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.lane.vg2x4.nxv8f16(i32 [[TMP5]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_svmlal_lane4_f16(uint32_t slice_base, svfloat16x4_t zn, svfloat16_t zm) +{ + SVE_ACLE_FUNC(svmlal,_lane,_za32,_f16,_vg2x4)(slice_base, 6, zn, zm, 7); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmlal_lane4_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv8bf16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.lane.vg2x4.nxv8bf16(i32 [[TMP5]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmlal_lane4_bf16j14svbfloat16x4_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv8bf16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.lane.vg2x4.nxv8bf16(i32 [[TMP5]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_svmlal_lane4_bf16(uint32_t slice_base, svbfloat16x4_t zn, svbfloat16_t zm) +{ + SVE_ACLE_FUNC(svmlal,_lane,_za32,_bf16,_vg2x4)(slice_base, 6, zn, zm, 7); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmlal_lane4_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv8i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.lane.vg2x4.nxv8i16(i32 [[TMP5]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], 
[[TMP4]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_svmlal_lane4_u16j12svuint16x4_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv8i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.lane.vg2x4.nxv8i16(i32 [[TMP5]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_svmlal_lane4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) +{ + SVE_ACLE_FUNC(svmlal,_lane,_za32,_u16,_vg2x4)(slice_base, 6, zn, zm, 7); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmlal_lane4_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv8i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.lane.vg2x4.nxv8i16(i32 [[TMP5]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_svmlal_lane4_s16j11svint16x4_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv8i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.lane.vg2x4.nxv8i16(i32 [[TMP5]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_svmlal_lane4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) +{ + SVE_ACLE_FUNC(svmlal,_lane,_za32,_s16,_vg2x4)(slice_base, 6, zn, zm, 7); +} diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mls.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mls.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mls.c @@ -0,0 +1,348 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// RUN: %clang_cc1 -D__ARM_FEATURE_SME -D__ARM_FEATURE_SME2 -D__ARM_FEATURE_SME_F64F64 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -mem2reg -instcombine -tailcallelim | FileCheck %s +// RUN: %clang_cc1 
-D__ARM_FEATURE_SME -D__ARM_FEATURE_SME2 -D__ARM_FEATURE_SME_F64F64 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -mem2reg -instcombine -tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -D__ARM_FEATURE_SME -D__ARM_FEATURE_SME2 -D__ARM_FEATURE_SME_F64F64 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -mem2reg -instcombine -tailcallelim | FileCheck %s +// RUN: %clang_cc1 -D__ARM_FEATURE_SME -D__ARM_FEATURE_SME2 -D__ARM_FEATURE_SME_F64F64 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -mem2reg -instcombine -tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -D__ARM_FEATURE_SME -D__ARM_FEATURE_SME2 -D__ARM_FEATURE_SME_F64F64 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED,A5) A1##A3##A5 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4,A5) A1##A2##A3##A4##A5 +#endif + +// +// Multi, multi +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls_multi2_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.multi.vg1x2.nxv4f32(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_svmls_multi2_f32j13svfloat32x2_t13svfloat32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.multi.vg1x2.nxv4f32(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_multi2_f32(uint32_t slice_base, svfloat32x2_t zn, svfloat32x2_t zm) { + SVE_ACLE_FUNC(svmls,,_za32,_f32,_vg1x2)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls_multi4_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 12)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 12) +// CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.multi.vg1x4.nxv4f32(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_svmls_multi4_f32j13svfloat32x4_t13svfloat32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 12) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.multi.vg1x4.nxv4f32(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_multi4_f32(uint32_t slice_base, svfloat32x4_t zn, svfloat32x4_t zm) { + SVE_ACLE_FUNC(svmls,,_za32,_f32,_vg1x4)(slice_base, 7, zn, zm); +} + +// +// Multi, single +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls_single2_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv4f32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.single.vg1x2.nxv4f32(i32 [[TMP3]], [[TMP0]], [[TMP1]], [[TMP2]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmls_single2_f32j13svfloat32x2_tu13__SVFloat32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv4f32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.single.vg1x2.nxv4f32(i32 [[TMP3]], [[TMP0]], [[TMP1]], [[TMP2]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_single2_f32(uint32_t slice_base, svfloat32x2_t zn, svfloat32_t zm) { + SVE_ACLE_FUNC(svmls,_single,_za32,_f32,_vg1x2)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls_single4_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] 
= tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv4f32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.single.vg1x4.nxv4f32(i32 [[TMP5]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmls_single4_f32j13svfloat32x4_tu13__SVFloat32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv4f32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.single.vg1x4.nxv4f32(i32 [[TMP5]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_single4_f32(uint32_t slice_base, svfloat32x4_t zn, svfloat32_t zm) { + SVE_ACLE_FUNC(svmls,_single,_za32,_f32,_vg1x4)(slice_base, 7, zn, zm); +} + +// +// Multi, indexed +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls_lane2_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv4f32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.lane.vg1x2.nxv4f32(i32 [[TMP3]], [[TMP0]], [[TMP1]], [[TMP2]], i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmls_lane2_f32j13svfloat32x2_tu13__SVFloat32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv4f32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.lane.vg1x2.nxv4f32(i32 [[TMP3]], [[TMP0]], [[TMP1]], [[TMP2]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_lane2_f32(uint32_t slice_base, svfloat32x2_t zn, svfloat32_t zm) { + SVE_ACLE_FUNC(svmls_lane,,_za32,_f32,_vg1x2)(slice_base, 7, zn, zm, 3); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls_lane4_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 8) +// CHECK-NEXT: 
[[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv4f32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.lane.vg1x4.nxv4f32(i32 [[TMP5]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmls_lane4_f32j13svfloat32x4_tu13__SVFloat32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv4f32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.lane.vg1x4.nxv4f32(i32 [[TMP5]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_lane4_f32(uint32_t slice_base, svfloat32x4_t zn, svfloat32_t zm) { + SVE_ACLE_FUNC(svmls_lane,,_za32,_f32,_vg1x4)(slice_base, 7, zn, zm, 3); +} + +// +// Multi, multi +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls_multi2_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZM]], i64 2) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.multi.vg1x2.nxv2f64(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_svmls_multi2_f64j13svfloat64x2_t13svfloat64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZM]], i64 2) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.multi.vg1x2.nxv2f64(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_multi2_f64(uint32_t slice_base, svfloat64x2_t zn, svfloat64x2_t zm) { + SVE_ACLE_FUNC(svmls,,_za64,_f64,_vg1x2)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls_multi4_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64
6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 2) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 6) +// CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.multi.vg1x4.nxv2f64(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_svmls_multi4_f64j13svfloat64x4_t13svfloat64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 2) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 6) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.multi.vg1x4.nxv2f64(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_multi4_f64(uint32_t slice_base, svfloat64x4_t zn, svfloat64x4_t zm) { + SVE_ACLE_FUNC(svmls,,_za64,_f64,_vg1x4)(slice_base, 7, zn, zm); +} + +// +// Multi, single +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls_single2_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv2f64( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.single.vg1x2.nxv2f64(i32 [[TMP3]], [[TMP0]], [[TMP1]], [[TMP2]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmls_single2_f64j13svfloat64x2_tu13__SVFloat64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv2f64( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.single.vg1x2.nxv2f64(i32 [[TMP3]], [[TMP0]], [[TMP1]], [[TMP2]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_single2_f64(uint32_t slice_base, svfloat64x2_t zn, svfloat64_t zm) { + SVE_ACLE_FUNC(svmls,_single,_za64,_f64,_vg1x2)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls_single4_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call 
@llvm.vector.extract.nxv2f64.nxv8f64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv2f64( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.single.vg1x4.nxv2f64(i32 [[TMP5]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmls_single4_f64j13svfloat64x4_tu13__SVFloat64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv2f64( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.single.vg1x4.nxv2f64(i32 [[TMP5]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_single4_f64(uint32_t slice_base, svfloat64x4_t zn, svfloat64_t zm) { + SVE_ACLE_FUNC(svmls,_single,_za64,_f64,_vg1x4)(slice_base, 7, zn, zm); +} + +// +// Multi, indexed +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls_lane2_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv2f64( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.lane.vg1x2.nxv2f64(i32 [[TMP3]], [[TMP0]], [[TMP1]], [[TMP2]], i32 1) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmls_lane2_f64j13svfloat64x2_tu13__SVFloat64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv2f64( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.lane.vg1x2.nxv2f64(i32 [[TMP3]], [[TMP0]], [[TMP1]], [[TMP2]], i32 1) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_lane2_f64(uint32_t slice_base, svfloat64x2_t zn, svfloat64_t zm) { + SVE_ACLE_FUNC(svmls_lane,,_za64,_f64,_vg1x2)(slice_base, 7, zn, zm, 1); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls_lane4_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call 
@llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv2f64( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.lane.vg1x4.nxv2f64(i32 [[TMP5]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], i32 1) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmls_lane4_f64j13svfloat64x4_tu13__SVFloat64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv2f64( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.lane.vg1x4.nxv2f64(i32 [[TMP5]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], i32 1) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_lane4_f64(uint32_t slice_base, svfloat64x4_t zn, svfloat64_t zm) { + SVE_ACLE_FUNC(svmls_lane,,_za64,_f64,_vg1x4)(slice_base, 7, zn, zm, 1); +} diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlsl.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlsl.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlsl.c @@ -0,0 +1,856 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// RUN: %clang_cc1 -D__ARM_FEATURE_SME -D__ARM_FEATURE_SME2 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -mem2reg -instcombine -tailcallelim | FileCheck %s +// RUN: %clang_cc1 -D__ARM_FEATURE_SME -D__ARM_FEATURE_SME2 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -mem2reg -instcombine -tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -D__ARM_FEATURE_SME -D__ARM_FEATURE_SME2 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -mem2reg -instcombine -tailcallelim | FileCheck %s +// RUN: %clang_cc1 -D__ARM_FEATURE_SME -D__ARM_FEATURE_SME2 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -mem2reg -instcombine -tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -D__ARM_FEATURE_SME -D__ARM_FEATURE_SME2 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. 
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED,A5) A1##A3##A5 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4,A5) A1##A2##A3##A4##A5 +#endif + +// +// Multi, multi +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmlsl_multi2_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.multi.vg2x2.nxv8f16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmlsl_multi2_f16j13svfloat16x2_t13svfloat16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.multi.vg2x2.nxv8f16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmlsl_multi2_f16(uint32_t slice_base, svfloat16x2_t zn, svfloat16x2_t zm) +{ + SVE_ACLE_FUNC(svmlsl_za32,_f16,_vg2x2,,)(slice_base, 6, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmlsl_multi2_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.multi.vg2x2.nxv8bf16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_svmlsl_multi2_bf16j14svbfloat16x2_t14svbfloat16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.multi.vg2x2.nxv8bf16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmlsl_multi2_bf16(uint32_t slice_base, svbfloat16x2_t zn, svbfloat16x2_t zm) +{ + SVE_ACLE_FUNC(svmlsl_za32,_bf16,_vg2x2,,)(slice_base, 6, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmlsl_multi2_u16( +// CHECK-NEXT: entry: +// 
CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.multi.vg2x2.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmlsl_multi2_u16j12svuint16x2_t12svuint16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.multi.vg2x2.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmlsl_multi2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16x2_t zm) +{ + SVE_ACLE_FUNC(svmlsl_za32,_u16,_vg2x2,,)(slice_base, 6, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmlsl_multi2_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.multi.vg2x2.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmlsl_multi2_s16j11svint16x2_t11svint16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.multi.vg2x2.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmlsl_multi2_s16(uint32_t slice_base, svint16x2_t zn, svint16x2_t zm) +{ + SVE_ACLE_FUNC(svmlsl_za32,_s16,_vg2x2,,)(slice_base, 6, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmlsl_multi4_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = 
tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 24) +// CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.multi.vg2x4.nxv8f16(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmlsl_multi4_f16j13svfloat16x4_t13svfloat16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 24) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.multi.vg2x4.nxv8f16(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmlsl_multi4_f16(uint32_t slice_base, svfloat16x4_t zn, svfloat16x4_t zm) +{ + SVE_ACLE_FUNC(svmlsl_za32,_f16,_vg2x4,,)(slice_base, 6, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmlsl_multi4_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZM]], i64 24) +// CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.multi.vg2x4.nxv8bf16(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_svmlsl_multi4_bf16j14svbfloat16x4_t14svbfloat16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: 
[[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZM]], i64 24) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.multi.vg2x4.nxv8bf16(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmlsl_multi4_bf16(uint32_t slice_base, svbfloat16x4_t zn, svbfloat16x4_t zm) +{ + SVE_ACLE_FUNC(svmlsl_za32,_bf16,_vg2x4,,)(slice_base, 6, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmlsl_multi4_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 24) +// CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.multi.vg2x4.nxv8i16(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmlsl_multi4_u16j12svuint16x4_t12svuint16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 24) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.multi.vg2x4.nxv8i16(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmlsl_multi4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16x4_t zm) +{ + SVE_ACLE_FUNC(svmlsl_za32,_u16,_vg2x4,,)(slice_base, 6, zn, zm); +} + +__attribute__((arm_streaming, 
arm_shared_za)) +// CHECK-LABEL: @test_svmlsl_multi4_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 24) +// CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.multi.vg2x4.nxv8i16(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmlsl_multi4_s16j11svint16x4_t11svint16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 24) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.multi.vg2x4.nxv8i16(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmlsl_multi4_s16(uint32_t slice_base, svint16x4_t zn, svint16x4_t zm) +{ + SVE_ACLE_FUNC(svmlsl_za32,_s16,_vg2x4,,)(slice_base, 6, zn, zm); +} + +// +// Multi, single +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmlsl_single1_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv8f16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv8f16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 14 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.single.vg2x1.nxv8f16(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_svmlsl_single1_f16ju13__SVFloat16_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv8f16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv8f16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 14 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.single.vg2x1.nxv8f16(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmlsl_single1_f16(uint32_t 
slice_base, svfloat16_t zn, svfloat16_t zm) +{ + SVE_ACLE_FUNC(svmlsl,_single,_za32,_f16,_vg2x1)(slice_base, 14, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmlsl_single1_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv8bf16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv8bf16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 14 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.single.vg2x1.nxv8bf16(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svmlsl_single1_bf16ju14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv8bf16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv8bf16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 14 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.single.vg2x1.nxv8bf16(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmlsl_single1_bf16(uint32_t slice_base, svbfloat16_t zn, svbfloat16_t zm) +{ + SVE_ACLE_FUNC(svmlsl,_single,_za32,_bf16,_vg2x1)(slice_base, 14, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmlsl_single1_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv8i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv8i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 14 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.single.vg2x1.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_svmlsl_single1_u16ju12__SVUint16_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv8i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv8i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 14 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.single.vg2x1.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmlsl_single1_u16(uint32_t slice_base, svuint16_t zn, svuint16_t zm) +{ + SVE_ACLE_FUNC(svmlsl,_single,_za32,_u16,_vg2x1)(slice_base, 14, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmlsl_single1_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv8i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv8i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 14 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.single.vg2x1.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_svmlsl_single1_s16ju11__SVInt16_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv8i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv8i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 14 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.single.vg2x1.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// 
CPP-CHECK-NEXT: ret void +// +void test_svmlsl_single1_s16(uint32_t slice_base, svint16_t zn, svint16_t zm) +{ + SVE_ACLE_FUNC(svmlsl,_single,_za32,_s16,_vg2x1)(slice_base, 14, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmlsl_single2_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv8f16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.single.vg2x2.nxv8f16(i32 [[TMP3]], [[TMP0]], [[TMP1]], [[TMP2]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_svmlsl_single2_f16j13svfloat16x2_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv8f16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.single.vg2x2.nxv8f16(i32 [[TMP3]], [[TMP0]], [[TMP1]], [[TMP2]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmlsl_single2_f16(uint32_t slice_base, svfloat16x2_t zn, svfloat16_t zm) +{ + SVE_ACLE_FUNC(svmlsl,_single,_za32,_f16,_vg2x2)(slice_base, 6, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmlsl_single2_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv8bf16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.single.vg2x2.nxv8bf16(i32 [[TMP3]], [[TMP0]], [[TMP1]], [[TMP2]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svmlsl_single2_bf16j14svbfloat16x2_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv8bf16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.single.vg2x2.nxv8bf16(i32 [[TMP3]], [[TMP0]], [[TMP1]], [[TMP2]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmlsl_single2_bf16(uint32_t slice_base, svbfloat16x2_t zn, svbfloat16_t zm) +{ + SVE_ACLE_FUNC(svmlsl,_single,_za32,_bf16,_vg2x2)(slice_base, 6, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmlsl_single2_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv8i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void 
@llvm.aarch64.sme.umlsl.single.vg2x2.nxv8i16(i32 [[TMP3]], [[TMP0]], [[TMP1]], [[TMP2]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_svmlsl_single2_u16j12svuint16x2_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv8i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.single.vg2x2.nxv8i16(i32 [[TMP3]], [[TMP0]], [[TMP1]], [[TMP2]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmlsl_single2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) +{ + SVE_ACLE_FUNC(svmlsl,_single,_za32,_u16,_vg2x2)(slice_base, 6, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmlsl_single2_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv8i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.single.vg2x2.nxv8i16(i32 [[TMP3]], [[TMP0]], [[TMP1]], [[TMP2]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_svmlsl_single2_s16j11svint16x2_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv8i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.single.vg2x2.nxv8i16(i32 [[TMP3]], [[TMP0]], [[TMP1]], [[TMP2]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmlsl_single2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) +{ + SVE_ACLE_FUNC(svmlsl,_single,_za32,_s16,_vg2x2)(slice_base, 6, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmlsl_single4_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv8f16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.single.vg2x4.nxv8f16(i32 [[TMP5]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_svmlsl_single4_f16j13svfloat16x4_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( 
[[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv8f16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.single.vg2x4.nxv8f16(i32 [[TMP5]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmlsl_single4_f16(uint32_t slice_base, svfloat16x4_t zn, svfloat16_t zm) +{ + SVE_ACLE_FUNC(svmlsl,_single,_za32,_f16,_vg2x4)(slice_base, 6, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmlsl_single4_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv8bf16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.single.vg2x4.nxv8bf16(i32 [[TMP5]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svmlsl_single4_bf16j14svbfloat16x4_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv8bf16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.single.vg2x4.nxv8bf16(i32 [[TMP5]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmlsl_single4_bf16(uint32_t slice_base, svbfloat16x4_t zn, svbfloat16_t zm) +{ + SVE_ACLE_FUNC(svmlsl,_single,_za32,_bf16,_vg2x4)(slice_base, 6, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmlsl_single4_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv8i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.single.vg2x4.nxv8i16(i32 [[TMP5]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_svmlsl_single4_u16j12svuint16x4_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// 
CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv8i16( [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.single.vg2x4.nxv8i16(i32 [[TMP5]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmlsl_single4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm)
+{
+ SVE_ACLE_FUNC(svmlsl,_single,_za32,_u16,_vg2x4)(slice_base, 6, zn, zm);
+}
+
+__attribute__((arm_streaming, arm_shared_za))
+// CHECK-LABEL: @test_svmlsl_single4_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv8i16( [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.single.vg2x4.nxv8i16(i32 [[TMP5]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z23test_svmlsl_single4_s16j11svint16x4_tu11__SVInt16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv8i16( [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.single.vg2x4.nxv8i16(i32 [[TMP5]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmlsl_single4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm)
+{
+ SVE_ACLE_FUNC(svmlsl,_single,_za32,_s16,_vg2x4)(slice_base, 6, zn, zm);
+}
+
+//
+// Multi, indexed
+//
+
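The _lane tests that follow differ from the _single forms above in one respect: a trailing immediate selects which element of zm is multiplied, and it is passed through unchanged as the last operand of the intrinsic. A minimal sketch of the correspondence, based on the first test below (illustrative, not an autogenerated check line):

// Illustrative mapping, assuming the same -disable-O0-optnone + instcombine
// pipeline as the RUN lines of this test file:
//   svmlsl_lane_za32_f16_vg2x1(slice_base, 14, zn, zm, 7);
// lowers to
//   %slice = add i32 %slice_base, 14
//   call void @llvm.aarch64.sme.fmlsl.lane.vg2x1.nxv8f16(i32 %slice, ..., i32 7)
// where the final i32 7 is the lane index into zm.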
+__attribute__((arm_streaming, arm_shared_za))
+// CHECK-LABEL: @test_svmlsl_lane1_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv8f16( [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv8f16( [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 14
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x1.nxv8f16(i32 [[TMP2]], [[TMP0]], [[TMP1]], i32 7)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z21test_svmlsl_lane1_f16ju13__SVFloat16_tu13__SVFloat16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv8f16( [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv8f16( [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 14
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x1.nxv8f16(i32 [[TMP2]], [[TMP0]], [[TMP1]], i32 7)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmlsl_lane1_f16(uint32_t slice_base, svfloat16_t zn, svfloat16_t zm)
+{
+ SVE_ACLE_FUNC(svmlsl,_lane,_za32,_f16,_vg2x1)(slice_base, 14, zn, zm, 7);
+}
+
+__attribute__((arm_streaming, arm_shared_za))
+// CHECK-LABEL: @test_svmlsl_lane1_bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv8bf16( [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv8bf16( [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 14
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x1.nxv8bf16(i32 [[TMP2]], [[TMP0]], [[TMP1]], i32 7)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z22test_svmlsl_lane1_bf16ju14__SVBFloat16_tu14__SVBFloat16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv8bf16( [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv8bf16( [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 14
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x1.nxv8bf16(i32 [[TMP2]], [[TMP0]], [[TMP1]], i32 7)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmlsl_lane1_bf16(uint32_t slice_base, svbfloat16_t zn, svbfloat16_t zm)
+{
+ SVE_ACLE_FUNC(svmlsl,_lane,_za32,_bf16,_vg2x1)(slice_base, 14, zn, zm, 7);
+}
+
+__attribute__((arm_streaming, arm_shared_za))
+// CHECK-LABEL: @test_svmlsl_lane1_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv8i16( [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv8i16( [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 14
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.lane.vg2x1.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], i32 7)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z21test_svmlsl_lane1_u16ju12__SVUint16_tu12__SVUint16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv8i16( [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv8i16( [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 14
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.lane.vg2x1.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], i32 7)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmlsl_lane1_u16(uint32_t slice_base, svuint16_t zn, svuint16_t zm)
+{
+ SVE_ACLE_FUNC(svmlsl,_lane,_za32,_u16,_vg2x1)(slice_base, 14, zn, zm, 7);
+}
+
+__attribute__((arm_streaming, arm_shared_za))
+// CHECK-LABEL: @test_svmlsl_lane1_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv8i16( [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv8i16( [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 14
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.lane.vg2x1.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], i32 7)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z21test_svmlsl_lane1_s16ju11__SVInt16_tu11__SVInt16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv8i16( [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call
@llvm.vector.extract.nxv8i16.nxv8i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 14 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.lane.vg2x1.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_svmlsl_lane1_s16(uint32_t slice_base, svint16_t zn, svint16_t zm) +{ + SVE_ACLE_FUNC(svmlsl,_lane,_za32,_s16,_vg2x1)(slice_base, 14, zn, zm, 7); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmlsl_lane2_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv8f16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x2.nxv8f16(i32 [[TMP3]], [[TMP0]], [[TMP1]], [[TMP2]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_svmlsl_lane2_f16j13svfloat16x2_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv8f16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x2.nxv8f16(i32 [[TMP3]], [[TMP0]], [[TMP1]], [[TMP2]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_svmlsl_lane2_f16(uint32_t slice_base, svfloat16x2_t zn, svfloat16_t zm) +{ + SVE_ACLE_FUNC(svmlsl,_lane,_za32,_f16,_vg2x2)(slice_base, 6, zn, zm, 7); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmlsl_lane2_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv8bf16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x2.nxv8bf16(i32 [[TMP3]], [[TMP0]], [[TMP1]], [[TMP2]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmlsl_lane2_bf16j14svbfloat16x2_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv8bf16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x2.nxv8bf16(i32 [[TMP3]], [[TMP0]], [[TMP1]], [[TMP2]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_svmlsl_lane2_bf16(uint32_t slice_base, svbfloat16x2_t zn, svbfloat16_t zm) +{ + SVE_ACLE_FUNC(svmlsl,_lane,_za32,_bf16,_vg2x2)(slice_base, 6, zn, zm, 7); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmlsl_lane2_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call 
@llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv8i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.lane.vg2x2.nxv8i16(i32 [[TMP3]], [[TMP0]], [[TMP1]], [[TMP2]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_svmlsl_lane2_u16j12svuint16x2_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv8i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.lane.vg2x2.nxv8i16(i32 [[TMP3]], [[TMP0]], [[TMP1]], [[TMP2]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_svmlsl_lane2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) +{ + SVE_ACLE_FUNC(svmlsl,_lane,_za32,_u16,_vg2x2)(slice_base, 6, zn, zm, 7); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmlsl_lane2_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv8i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.lane.vg2x2.nxv8i16(i32 [[TMP3]], [[TMP0]], [[TMP1]], [[TMP2]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_svmlsl_lane2_s16j11svint16x2_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv8i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.lane.vg2x2.nxv8i16(i32 [[TMP3]], [[TMP0]], [[TMP1]], [[TMP2]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_svmlsl_lane2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) +{ + SVE_ACLE_FUNC(svmlsl,_lane,_za32,_s16,_vg2x2)(slice_base, 6, zn, zm, 7); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmlsl_lane4_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv8f16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x4.nxv8f16(i32 [[TMP5]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_svmlsl_lane4_f16j13svfloat16x4_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] 
= tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv8f16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x4.nxv8f16(i32 [[TMP5]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_svmlsl_lane4_f16(uint32_t slice_base, svfloat16x4_t zn, svfloat16_t zm) +{ + SVE_ACLE_FUNC(svmlsl,_lane,_za32,_f16,_vg2x4)(slice_base, 6, zn, zm, 7); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmlsl_lane4_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv8bf16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x4.nxv8bf16(i32 [[TMP5]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmlsl_lane4_bf16j14svbfloat16x4_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv8bf16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x4.nxv8bf16(i32 [[TMP5]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_svmlsl_lane4_bf16(uint32_t slice_base, svbfloat16x4_t zn, svbfloat16_t zm) +{ + SVE_ACLE_FUNC(svmlsl,_lane,_za32,_bf16,_vg2x4)(slice_base, 6, zn, zm, 7); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmlsl_lane4_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv8i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.lane.vg2x4.nxv8i16(i32 [[TMP5]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], 
[[TMP4]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_svmlsl_lane4_u16j12svuint16x4_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv8i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.lane.vg2x4.nxv8i16(i32 [[TMP5]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_svmlsl_lane4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) +{ + SVE_ACLE_FUNC(svmlsl,_lane,_za32,_u16,_vg2x4)(slice_base, 6, zn, zm, 7); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmlsl_lane4_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv8i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.lane.vg2x4.nxv8i16(i32 [[TMP5]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_svmlsl_lane4_s16j11svint16x4_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv8i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.lane.vg2x4.nxv8i16(i32 [[TMP5]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_svmlsl_lane4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) +{ + SVE_ACLE_FUNC(svmlsl,_lane,_za32,_s16,_vg2x4)(slice_base, 6, zn, zm, 7); +} diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_read.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_read.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_read.c @@ -0,0 +1,1500 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// RUN: %clang_cc1 -D__ARM_FEATURE_SME -D__ARM_FEATURE_SME2 -D__ARM_FEATURE_SME_F64F64 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -mem2reg -instcombine -tailcallelim | FileCheck %s +// RUN: %clang_cc1 
-D__ARM_FEATURE_SME -D__ARM_FEATURE_SME2 -D__ARM_FEATURE_SME_F64F64 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -mem2reg -instcombine -tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -D__ARM_FEATURE_SME -D__ARM_FEATURE_SME2 -D__ARM_FEATURE_SME_F64F64 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -mem2reg -instcombine -tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -D__ARM_FEATURE_SME -D__ARM_FEATURE_SME2 -D__ARM_FEATURE_SME_F64F64 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -mem2reg -instcombine -tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -D__ARM_FEATURE_SME -D__ARM_FEATURE_SME2 -D__ARM_FEATURE_SME_F64F64 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+#include <arm_sme_draft_spec_subject_to_change.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4
+#endif
+
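The SVE_ACLE_FUNC macro above lets the same test source exercise both the full and the type-overloaded ACLE names. A minimal sketch of the expansion (a hypothetical invocation for illustration; the svread tests below happen to spell out the full names directly):

// Hypothetical call through the 4-argument macro defined above:
//   SVE_ACLE_FUNC(svread_ver_za8,_u8,_vg2,)(base, 14);
// Without SVE_OVERLOADED_FORMS all four tokens are pasted, giving
//   svread_ver_za8_u8_vg2(base, 14);
// With SVE_OVERLOADED_FORMS only A1 and A3 survive, giving the
// overloaded form
//   svread_ver_za8_vg2(base, 14);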
+__attribute__((arm_streaming, arm_shared_za))
+// CHECK-LABEL: @test_svread_ver_za8_u8_vg2(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 14
+// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.ver.vg2.nxv16i8(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0
+// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP2]], i64 0)
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1
+// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP3]], [[TMP4]], i64 16)
+// CHECK-NEXT: ret [[TMP5]]
+//
+// CPP-CHECK-LABEL: @_Z26test_svread_ver_za8_u8_vg2j(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 14
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.ver.vg2.nxv16i8(i32 [[TMP0]])
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP2]], i64 0)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP3]], [[TMP4]], i64 16)
+// CPP-CHECK-NEXT: ret [[TMP5]]
+//
+svuint8x2_t test_svread_ver_za8_u8_vg2(uint32_t base) {
+ return svread_ver_za8_u8_vg2(base, 14);
+}
+__attribute__((arm_streaming, arm_shared_za))
+// CHECK-LABEL: @test_svread_ver_za8_s8_vg2(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 14
+// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.ver.vg2.nxv16i8(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0
+// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP2]], i64 0)
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1
+// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP3]], [[TMP4]], i64 16)
+// CHECK-NEXT: ret [[TMP5]]
+//
+// CPP-CHECK-LABEL: @_Z26test_svread_ver_za8_s8_vg2j(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 14
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.ver.vg2.nxv16i8(i32 [[TMP0]])
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP2]], i64 0)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP3]], [[TMP4]], i64 16)
+// CPP-CHECK-NEXT: ret [[TMP5]]
+//
+svint8x2_t test_svread_ver_za8_s8_vg2(uint32_t base) {
+ return svread_ver_za8_s8_vg2(base, 14);
+}
+__attribute__((arm_streaming, arm_shared_za))
+// CHECK-LABEL: @test_svread_hor_za8_u8_vg2(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 14
+// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.hor.vg2.nxv16i8(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0
+// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP2]], i64 0)
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1
+// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP3]], [[TMP4]], i64 16)
+// CHECK-NEXT: ret [[TMP5]]
+//
+// CPP-CHECK-LABEL: @_Z26test_svread_hor_za8_u8_vg2j(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 14
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.hor.vg2.nxv16i8(i32 [[TMP0]])
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP2]], i64 0)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP3]], [[TMP4]], i64 16)
+// CPP-CHECK-NEXT: ret [[TMP5]]
+//
+svuint8x2_t test_svread_hor_za8_u8_vg2(uint32_t base) {
+ return svread_hor_za8_u8_vg2(base, 14);
+}
+__attribute__((arm_streaming, arm_shared_za))
+// CHECK-LABEL: @test_svread_hor_za8_s8_vg2(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 14
+// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.hor.vg2.nxv16i8(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0
+// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP2]], i64 0)
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1
+// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP3]], [[TMP4]], i64 16)
+// CHECK-NEXT: ret [[TMP5]]
+//
+// CPP-CHECK-LABEL: @_Z26test_svread_hor_za8_s8_vg2j(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 14
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.hor.vg2.nxv16i8(i32 [[TMP0]])
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP2]], i64 0)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP3]], [[TMP4]], i64 16)
+// CPP-CHECK-NEXT: ret [[TMP5]]
+//
+svint8x2_t test_svread_hor_za8_s8_vg2(uint32_t base) {
+ return svread_hor_za8_s8_vg2(base, 14);
+}
+
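Each vg2 form above returns two ZA slices as a single vector tuple: the checks show the slice index being formed as base plus the immediate, one sme.read.{hor,ver}.vg2 call returning a pair of scalable vectors, and the pair being packed into the x2 tuple with llvm.vector.insert. A hedged usage sketch (illustrative only; assumes an SME2-enabled toolchain and the header included above):

// Illustrative only, not part of the generated checks: consume one vg2 read
// by splitting the returned tuple back into its two single vectors.
__attribute__((arm_streaming, arm_shared_za))
void consume_vg2(uint32_t base) {
  svuint8x2_t pair = svread_hor_za8_u8_vg2(base, 14); // one 2-vector ZA read
  svuint8_t v0 = svget2(pair, 0);                     // first slice vector
  svuint8_t v1 = svget2(pair, 1);                     // second slice vector
  (void)v0;
  (void)v1;
}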
+__attribute__((arm_streaming, arm_shared_za))
+// CHECK-LABEL: @test_svread_hor_za8_u8_vg4(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 12
+// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv16i8(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0
+// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP2]], i64 0)
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1
+// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP3]], [[TMP4]], i64 16)
+// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2
+// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP5]], [[TMP6]], i64 32)
+// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3
+// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP7]], [[TMP8]], i64 48)
+// CHECK-NEXT: ret [[TMP9]]
+//
+// CPP-CHECK-LABEL: @_Z26test_svread_hor_za8_u8_vg4j(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 12
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv16i8(i32 [[TMP0]])
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP2]], i64 0)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP3]], [[TMP4]], i64 16)
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP5]], [[TMP6]], i64 32)
+// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3
+// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP7]], [[TMP8]], i64 48)
+// CPP-CHECK-NEXT: ret [[TMP9]]
+//
+svuint8x4_t test_svread_hor_za8_u8_vg4(uint32_t base) {
+ return svread_hor_za8_u8_vg4(base, 12);
+}
+__attribute__((arm_streaming, arm_shared_za))
+// CHECK-LABEL: @test_svread_hor_za8_s8_vg4(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 12
+// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv16i8(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0
+// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP2]], i64 0)
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1
+// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP3]], [[TMP4]], i64 16)
+// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2
+// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP5]], [[TMP6]], i64 32)
+// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3
+// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP7]], [[TMP8]], i64 48)
+// CHECK-NEXT: ret [[TMP9]]
+//
+// CPP-CHECK-LABEL: @_Z26test_svread_hor_za8_s8_vg4j(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 12
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv16i8(i32 [[TMP0]])
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP2]], i64 0)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP3]], [[TMP4]], i64 16)
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , ,
, } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP5]], [[TMP6]], i64 32) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP7]], [[TMP8]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP9]] +// +svint8x4_t test_svread_hor_za8_s8_vg4(uint32_t base) { + return svread_hor_za8_s8_vg4(base, 12); +} +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za8_u8_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 12 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv16i8(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP3]], [[TMP4]], i64 16) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP5]], [[TMP6]], i64 32) +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP7]], [[TMP8]], i64 48) +// CHECK-NEXT: ret [[TMP9]] +// +// CPP-CHECK-LABEL: @_Z26test_svread_ver_za8_u8_vg4j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 12 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv16i8(i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP3]], [[TMP4]], i64 16) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP5]], [[TMP6]], i64 32) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP7]], [[TMP8]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP9]] +// +svuint8x4_t test_svread_ver_za8_u8_vg4(uint32_t base) { + return svread_ver_za8_u8_vg4(base, 12); +} +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za8_s8_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 12 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv16i8(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP3]], [[TMP4]], i64 16) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP5]], [[TMP6]], i64 32) +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP7]], [[TMP8]], i64 48) +// CHECK-NEXT: ret [[TMP9]] +// +// 
CPP-CHECK-LABEL: @_Z26test_svread_ver_za8_s8_vg4j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 12 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv16i8(i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP3]], [[TMP4]], i64 16) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP5]], [[TMP6]], i64 32) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP7]], [[TMP8]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP9]] +// +svint8x4_t test_svread_ver_za8_s8_vg4(uint32_t base) { + return svread_ver_za8_s8_vg4(base, 12); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za16_u16_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 6 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.hor.vg2.nxv8i16(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP3]], [[TMP4]], i64 8) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_hor_za16_u16_vg2j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 6 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.hor.vg2.nxv8i16(i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP3]], [[TMP4]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svuint16x2_t test_svread_hor_za16_u16_vg2(uint32_t base) { + return svread_hor_za16_u16_vg2(base, 6); +} +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za16_bf16_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 6 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.hor.vg2.nxv8bf16(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( [[TMP3]], [[TMP4]], i64 8) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z29test_svread_hor_za16_bf16_vg2j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 6 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.hor.vg2.nxv8bf16(i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( poison, [[TMP2]], i64 0) 
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( [[TMP3]], [[TMP4]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svbfloat16x2_t test_svread_hor_za16_bf16_vg2(uint32_t base) { + return svread_hor_za16_bf16_vg2(base, 6); +} +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za16_f16_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 6 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.hor.vg2.nxv8f16(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP3]], [[TMP4]], i64 8) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_hor_za16_f16_vg2j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 6 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.hor.vg2.nxv8f16(i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP3]], [[TMP4]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svfloat16x2_t test_svread_hor_za16_f16_vg2(uint32_t base) { + return svread_hor_za16_f16_vg2(base, 6); +} +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za16_s16_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 6 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.hor.vg2.nxv8i16(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP3]], [[TMP4]], i64 8) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_hor_za16_s16_vg2j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 6 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.hor.vg2.nxv8i16(i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP3]], [[TMP4]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svint16x2_t test_svread_hor_za16_s16_vg2(uint32_t base) { + return svread_hor_za16_s16_vg2(base, 6); +} +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za16_u16_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 6 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.ver.vg2.nxv8i16(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = 
extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP3]], [[TMP4]], i64 8) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_ver_za16_u16_vg2j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 6 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.ver.vg2.nxv8i16(i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP3]], [[TMP4]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svuint16x2_t test_svread_ver_za16_u16_vg2(uint32_t base) { + return svread_ver_za16_u16_vg2(base, 6); +} +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za16_bf16_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 6 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.ver.vg2.nxv8bf16(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( [[TMP3]], [[TMP4]], i64 8) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z29test_svread_ver_za16_bf16_vg2j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 6 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.ver.vg2.nxv8bf16(i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( [[TMP3]], [[TMP4]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svbfloat16x2_t test_svread_ver_za16_bf16_vg2(uint32_t base) { + return svread_ver_za16_bf16_vg2(base, 6); +} +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za16_f16_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 6 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.ver.vg2.nxv8f16(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP3]], [[TMP4]], i64 8) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_ver_za16_f16_vg2j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 6 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.ver.vg2.nxv8f16(i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP3]], [[TMP4]], i64 8) +// 
CPP-CHECK-NEXT: ret [[TMP5]] +// +svfloat16x2_t test_svread_ver_za16_f16_vg2(uint32_t base) { + return svread_ver_za16_f16_vg2(base, 6); +} +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za16_s16_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 6 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.ver.vg2.nxv8i16(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP3]], [[TMP4]], i64 8) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_ver_za16_s16_vg2j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 6 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.ver.vg2.nxv8i16(i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP3]], [[TMP4]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svint16x2_t test_svread_ver_za16_s16_vg2(uint32_t base) { + return svread_ver_za16_s16_vg2(base, 6); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za16_u16_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 4 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv8i16(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP3]], [[TMP4]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP5]], [[TMP6]], i64 16) +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP7]], [[TMP8]], i64 24) +// CHECK-NEXT: ret [[TMP9]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_hor_za16_u16_vg4j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 4 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv8i16(i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP3]], [[TMP4]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP5]], [[TMP6]], i64 16) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP7]], [[TMP8]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP9]] +// +svuint16x4_t 
test_svread_hor_za16_u16_vg4(uint32_t base) { + return svread_hor_za16_u16_vg4(base, 4); +} +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za16_bf16_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 4 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv8bf16(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP3]], [[TMP4]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP5]], [[TMP6]], i64 16) +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP7]], [[TMP8]], i64 24) +// CHECK-NEXT: ret [[TMP9]] +// +// CPP-CHECK-LABEL: @_Z29test_svread_hor_za16_bf16_vg4j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 4 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv8bf16(i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP3]], [[TMP4]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP5]], [[TMP6]], i64 16) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP7]], [[TMP8]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP9]] +// +svbfloat16x4_t test_svread_hor_za16_bf16_vg4(uint32_t base) { + return svread_hor_za16_bf16_vg4(base, 4); +} +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za16_f16_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 4 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv8f16(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP3]], [[TMP4]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP5]], [[TMP6]], i64 16) +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP7]], [[TMP8]], i64 24) +// CHECK-NEXT: ret [[TMP9]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_hor_za16_f16_vg4j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 4 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv8f16(i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// 
CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP3]], [[TMP4]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP5]], [[TMP6]], i64 16) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP7]], [[TMP8]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP9]] +// +svfloat16x4_t test_svread_hor_za16_f16_vg4(uint32_t base) { + return svread_hor_za16_f16_vg4(base, 4); +} +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za16_s16_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 4 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv8i16(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP3]], [[TMP4]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP5]], [[TMP6]], i64 16) +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP7]], [[TMP8]], i64 24) +// CHECK-NEXT: ret [[TMP9]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_hor_za16_s16_vg4j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 4 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv8i16(i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP3]], [[TMP4]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP5]], [[TMP6]], i64 16) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP7]], [[TMP8]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP9]] +// +svint16x4_t test_svread_hor_za16_s16_vg4(uint32_t base) { + return svread_hor_za16_s16_vg4(base, 4); +} +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za16_u16_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 4 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv8i16(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP3]], [[TMP4]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = 
extractvalue { , , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP5]], [[TMP6]], i64 16) +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP7]], [[TMP8]], i64 24) +// CHECK-NEXT: ret [[TMP9]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_ver_za16_u16_vg4j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 4 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv8i16(i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP3]], [[TMP4]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP5]], [[TMP6]], i64 16) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP7]], [[TMP8]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP9]] +// +svuint16x4_t test_svread_ver_za16_u16_vg4(uint32_t base) { + return svread_ver_za16_u16_vg4(base, 4); +} +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za16_bf16_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 4 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv8bf16(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP3]], [[TMP4]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP5]], [[TMP6]], i64 16) +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP7]], [[TMP8]], i64 24) +// CHECK-NEXT: ret [[TMP9]] +// +// CPP-CHECK-LABEL: @_Z29test_svread_ver_za16_bf16_vg4j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 4 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv8bf16(i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP3]], [[TMP4]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP5]], [[TMP6]], i64 16) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP7]], [[TMP8]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP9]] +// +svbfloat16x4_t test_svread_ver_za16_bf16_vg4(uint32_t base) { + 
return svread_ver_za16_bf16_vg4(base, 4); +} +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za16_f16_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 4 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv8f16(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP3]], [[TMP4]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP5]], [[TMP6]], i64 16) +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP7]], [[TMP8]], i64 24) +// CHECK-NEXT: ret [[TMP9]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_ver_za16_f16_vg4j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 4 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv8f16(i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP3]], [[TMP4]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP5]], [[TMP6]], i64 16) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP7]], [[TMP8]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP9]] +// +svfloat16x4_t test_svread_ver_za16_f16_vg4(uint32_t base) { + return svread_ver_za16_f16_vg4(base, 4); +} +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za16_s16_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 4 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv8i16(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP3]], [[TMP4]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP5]], [[TMP6]], i64 16) +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP7]], [[TMP8]], i64 24) +// CHECK-NEXT: ret [[TMP9]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_ver_za16_s16_vg4j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 4 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv8i16(i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call 
@llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP3]], [[TMP4]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP5]], [[TMP6]], i64 16) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP7]], [[TMP8]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP9]] +// +svint16x4_t test_svread_ver_za16_s16_vg4(uint32_t base) { + return svread_ver_za16_s16_vg4(base, 4); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za32_u32_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 2 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.hor.vg2.nxv4i32(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP3]], [[TMP4]], i64 4) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_hor_za32_u32_vg2j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 2 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.hor.vg2.nxv4i32(i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP3]], [[TMP4]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svuint32x2_t test_svread_hor_za32_u32_vg2(uint32_t base) { + return svread_hor_za32_u32_vg2(base, 2); +} +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za32_f32_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 2 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.hor.vg2.nxv4f32(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP3]], [[TMP4]], i64 4) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_hor_za32_f32_vg2j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 2 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.hor.vg2.nxv4f32(i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP3]], [[TMP4]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svfloat32x2_t test_svread_hor_za32_f32_vg2(uint32_t base) { + return svread_hor_za32_f32_vg2(base, 2); +} +__attribute__((arm_streaming, arm_shared_za)) +// 
CHECK-LABEL: @test_svread_hor_za32_s32_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 2 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.hor.vg2.nxv4i32(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP3]], [[TMP4]], i64 4) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_hor_za32_s32_vg2j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 2 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.hor.vg2.nxv4i32(i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP3]], [[TMP4]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svint32x2_t test_svread_hor_za32_s32_vg2(uint32_t base) { + return svread_hor_za32_s32_vg2(base, 2); +} +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za32_u32_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 2 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.ver.vg2.nxv4i32(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP3]], [[TMP4]], i64 4) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_ver_za32_u32_vg2j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 2 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.ver.vg2.nxv4i32(i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP3]], [[TMP4]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svuint32x2_t test_svread_ver_za32_u32_vg2(uint32_t base) { + return svread_ver_za32_u32_vg2(base, 2); +} +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za32_f32_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 2 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.ver.vg2.nxv4f32(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP3]], [[TMP4]], i64 4) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_ver_za32_f32_vg2j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 2 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } 
@llvm.aarch64.sme.read.ver.vg2.nxv4f32(i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP3]], [[TMP4]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svfloat32x2_t test_svread_ver_za32_f32_vg2(uint32_t base) { + return svread_ver_za32_f32_vg2(base, 2); +} +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za32_s32_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 2 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.ver.vg2.nxv4i32(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP3]], [[TMP4]], i64 4) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_ver_za32_s32_vg2j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 2 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.ver.vg2.nxv4i32(i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP3]], [[TMP4]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svint32x2_t test_svread_ver_za32_s32_vg2(uint32_t base) { + return svread_ver_za32_s32_vg2(base, 2); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za32_u32_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv4i32(i32 [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_hor_za32_u32_vg4j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv4i32(i32 [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call 
@llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint32x4_t test_svread_hor_za32_u32_vg4(uint32_t base) { + return svread_hor_za32_u32_vg4(base, 0); +} +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za32_f32_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv4f32(i32 [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_hor_za32_f32_vg4j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv4f32(i32 [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat32x4_t test_svread_hor_za32_f32_vg4(uint32_t base) { + return svread_hor_za32_f32_vg4(base, 0); +} +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za32_s32_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv4i32(i32 [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_hor_za32_s32_vg4j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv4i32(i32 [[BASE:%.*]]) +// 
CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint32x4_t test_svread_hor_za32_s32_vg4(uint32_t base) { + return svread_hor_za32_s32_vg4(base, 0); +} +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za32_u32_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv4i32(i32 [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_ver_za32_u32_vg4j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv4i32(i32 [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint32x4_t test_svread_ver_za32_u32_vg4(uint32_t base) { + return svread_ver_za32_u32_vg4(base, 0); +} +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za32_f32_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv4f32(i32 [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call 
@llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_ver_za32_f32_vg4j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv4f32(i32 [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat32x4_t test_svread_ver_za32_f32_vg4(uint32_t base) { + return svread_ver_za32_f32_vg4(base, 0); +} +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za32_s32_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv4i32(i32 [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_ver_za32_s32_vg4j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv4i32(i32 [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint32x4_t test_svread_ver_za32_s32_vg4(uint32_t base) { + return svread_ver_za32_s32_vg4(base, 0); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za64_u64_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.read.hor.vg2.nxv2i64(i32 
[[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_hor_za64_u64_vg2j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.read.hor.vg2.nxv2i64(i32 [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint64x2_t test_svread_hor_za64_u64_vg2(uint32_t base) { + return svread_hor_za64_u64_vg2(base, 0); +} +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za64_f64_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.read.hor.vg2.nxv2f64(i32 [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_hor_za64_f64_vg2j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.read.hor.vg2.nxv2f64(i32 [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svfloat64x2_t test_svread_hor_za64_f64_vg2(uint32_t base) { + return svread_hor_za64_f64_vg2(base, 0); +} +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za64_s64_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.read.hor.vg2.nxv2i64(i32 [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_hor_za64_s64_vg2j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.read.hor.vg2.nxv2i64(i32 [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svint64x2_t test_svread_hor_za64_s64_vg2(uint32_t base) 
{ + return svread_hor_za64_s64_vg2(base, 0); +} +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za64_u64_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.read.ver.vg2.nxv2i64(i32 [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_ver_za64_u64_vg2j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.read.ver.vg2.nxv2i64(i32 [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint64x2_t test_svread_ver_za64_u64_vg2(uint32_t base) { + return svread_ver_za64_u64_vg2(base, 0); +} +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za64_f64_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.read.ver.vg2.nxv2f64(i32 [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_ver_za64_f64_vg2j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.read.ver.vg2.nxv2f64(i32 [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svfloat64x2_t test_svread_ver_za64_f64_vg2(uint32_t base) { + return svread_ver_za64_f64_vg2(base, 0); +} +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za64_s64_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.read.ver.vg2.nxv2i64(i32 [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_ver_za64_s64_vg2j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.read.ver.vg2.nxv2i64(i32 [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) +// 
CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svint64x2_t test_svread_ver_za64_s64_vg2(uint32_t base) { + return svread_ver_za64_s64_vg2(base, 0); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za64_u64_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv2i64(i32 [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_hor_za64_u64_vg4j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv2i64(i32 [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint64x4_t test_svread_hor_za64_u64_vg4(uint32_t base) { + return svread_hor_za64_u64_vg4(base, 0); +} +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za64_f64_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv2f64(i32 [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 6) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_hor_za64_f64_vg4j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv2f64(i32 [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = 
tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat64x4_t test_svread_hor_za64_f64_vg4(uint32_t base) { + return svread_hor_za64_f64_vg4(base, 0); +} +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za64_s64_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv2i64(i32 [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_hor_za64_s64_vg4j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv2i64(i32 [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint64x4_t test_svread_hor_za64_s64_vg4(uint32_t base) { + return svread_hor_za64_s64_vg4(base, 0); +} +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za64_u64_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv2i64(i32 [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// 
CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_ver_za64_u64_vg4j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv2i64(i32 [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint64x4_t test_svread_ver_za64_u64_vg4(uint32_t base) { + return svread_ver_za64_u64_vg4(base, 0); +} +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za64_f64_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv2f64(i32 [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 6) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_ver_za64_f64_vg4j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv2f64(i32 [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat64x4_t test_svread_ver_za64_f64_vg4(uint32_t base) { + return svread_ver_za64_f64_vg4(base, 0); +} +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za64_s64_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv2i64(i32 [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( 
poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_ver_za64_s64_vg4j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv2i64(i32 [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint64x4_t test_svread_ver_za64_s64_vg4(uint32_t base) { + return svread_ver_za64_s64_vg4(base, 0); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_za64_u64_vg1x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 7 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.vg1x2.nxv2i64(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP3]], [[TMP4]], i64 2) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z26test_svread_za64_u64_vg1x2j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 7 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.vg1x2.nxv2i64(i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP3]], [[TMP4]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svuint64x2_t test_svread_za64_u64_vg1x2(uint32_t base) { + return svread_za64_u64_vg1x2(base, 7); +} +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_za64_f64_vg1x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 7 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.vg1x2.nxv2f64(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call 
@llvm.vector.insert.nxv4f64.nxv2f64( [[TMP3]], [[TMP4]], i64 2) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z26test_svread_za64_f64_vg1x2j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 7 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.vg1x2.nxv2f64(i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP3]], [[TMP4]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svfloat64x2_t test_svread_za64_f64_vg1x2(uint32_t base) { + return svread_za64_f64_vg1x2(base, 7); +} +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_za64_s64_vg1x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 7 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.vg1x2.nxv2i64(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP3]], [[TMP4]], i64 2) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z26test_svread_za64_s64_vg1x2j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 7 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.vg1x2.nxv2i64(i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP3]], [[TMP4]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svint64x2_t test_svread_za64_s64_vg1x2(uint32_t base) { + return svread_za64_s64_vg1x2(base, 7); +} +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_za64_u64_vg1x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 7 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.vg1x4.nxv2i64(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP3]], [[TMP4]], i64 2) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP5]], [[TMP6]], i64 4) +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP7]], [[TMP8]], i64 6) +// CHECK-NEXT: ret [[TMP9]] +// +// CPP-CHECK-LABEL: @_Z26test_svread_za64_u64_vg1x4j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 7 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.vg1x4.nxv2i64(i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call 
@llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP3]], [[TMP4]], i64 2) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP5]], [[TMP6]], i64 4) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP7]], [[TMP8]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP9]] +// +svuint64x4_t test_svread_za64_u64_vg1x4(uint32_t base) { + return svread_za64_u64_vg1x4(base, 7); +} +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_za64_f64_vg1x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 7 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.vg1x4.nxv2f64(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP3]], [[TMP4]], i64 2) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP5]], [[TMP6]], i64 4) +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP7]], [[TMP8]], i64 6) +// CHECK-NEXT: ret [[TMP9]] +// +// CPP-CHECK-LABEL: @_Z26test_svread_za64_f64_vg1x4j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 7 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.vg1x4.nxv2f64(i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP3]], [[TMP4]], i64 2) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP5]], [[TMP6]], i64 4) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP7]], [[TMP8]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP9]] +// +svfloat64x4_t test_svread_za64_f64_vg1x4(uint32_t base) { + return svread_za64_f64_vg1x4(base, 7); +} +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_za64_s64_vg1x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 7 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.vg1x4.nxv2i64(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP3]], [[TMP4]], i64 2) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call 
<vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP5]], <vscale x 2 x i64> [[TMP6]], i64 4)
+// CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 3
+// CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP7]], <vscale x 2 x i64> [[TMP8]], i64 6)
+// CHECK-NEXT:    ret <vscale x 8 x i64> [[TMP9]]
+//
+// CPP-CHECK-LABEL: @_Z26test_svread_za64_s64_vg1x4j(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[BASE:%.*]], 7
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.read.vg1x4.nxv2i64(i32 [[TMP0]])
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 0
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> poison, <vscale x 2 x i64> [[TMP2]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 1
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP4]], i64 2)
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 2
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP5]], <vscale x 2 x i64> [[TMP6]], i64 4)
+// CPP-CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 3
+// CPP-CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP7]], <vscale x 2 x i64> [[TMP8]], i64 6)
+// CPP-CHECK-NEXT:    ret <vscale x 8 x i64> [[TMP9]]
+//
+svint64x4_t test_svread_za64_s64_vg1x4(uint32_t base) {
+  return svread_za64_s64_vg1x4(base, 7);
+}
+
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_write.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_write.c
new file mode 100644
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_write.c
@@ -0,0 +1,1176 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang_cc1 -D__ARM_FEATURE_SME -D__ARM_FEATURE_SME2 -D__ARM_FEATURE_SME_F64F64 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -mem2reg -instcombine -tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -D__ARM_FEATURE_SME -D__ARM_FEATURE_SME2 -D__ARM_FEATURE_SME_F64F64 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -mem2reg -instcombine -tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -D__ARM_FEATURE_SME -D__ARM_FEATURE_SME2 -D__ARM_FEATURE_SME_F64F64 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -mem2reg -instcombine -tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -D__ARM_FEATURE_SME -D__ARM_FEATURE_SME2 -D__ARM_FEATURE_SME_F64F64 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -mem2reg -instcombine -tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -D__ARM_FEATURE_SME -D__ARM_FEATURE_SME2 -D__ARM_FEATURE_SME_F64F64 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+#include <arm_sme_draft_spec_subject_to_change.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
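+// For example, SVE_ACLE_FUNC(svwrite_ver_za8,_u8,_vg2,) expands to the
+// overloaded name svwrite_ver_za8_vg2 when SVE_OVERLOADED_FORMS is defined,
+// and to the fully-suffixed name svwrite_ver_za8_u8_vg2 otherwise.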
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4
+#endif
+
+__attribute__((arm_streaming, arm_shared_za))
+// CHECK-LABEL: @test_svwrite_ver_za8_u8_vg2(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[VAL:%.*]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[VAL]], i64 16)
+// CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[BASE:%.*]], 14
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.ver.vg2.nxv16i8(i32 [[TMP2]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z27test_svwrite_ver_za8_u8_vg2j11svuint8x2_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[VAL:%.*]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[VAL]], i64 16)
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[BASE:%.*]], 14
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.ver.vg2.nxv16i8(i32 [[TMP2]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svwrite_ver_za8_u8_vg2(uint32_t base, svuint8x2_t val) {
+  SVE_ACLE_FUNC(svwrite_ver_za8,_u8,_vg2,)(base, 14, val);
+}
+
+__attribute__((arm_streaming, arm_shared_za))
+// CHECK-LABEL: @test_svwrite_ver_za8_s8_vg2(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[VAL:%.*]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[VAL]], i64 16)
+// CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[BASE:%.*]], 14
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.ver.vg2.nxv16i8(i32 [[TMP2]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z27test_svwrite_ver_za8_s8_vg2j10svint8x2_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[VAL:%.*]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[VAL]], i64 16)
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[BASE:%.*]], 14
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.ver.vg2.nxv16i8(i32 [[TMP2]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svwrite_ver_za8_s8_vg2(uint32_t base, svint8x2_t val) {
+  SVE_ACLE_FUNC(svwrite_ver_za8,_s8,_vg2,)(base, 14, val);
+}
+__attribute__((arm_streaming, arm_shared_za))
+// CHECK-LABEL: @test_svwrite_hor_za8_u8_vg2(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[VAL:%.*]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[VAL]], i64 16)
+// CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[BASE:%.*]], 14
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.hor.vg2.nxv16i8(i32 [[TMP2]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z27test_svwrite_hor_za8_u8_vg2j11svuint8x2_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[VAL:%.*]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[VAL]], i64 16)
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[BASE:%.*]], 14
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.hor.vg2.nxv16i8(i32 [[TMP2]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svwrite_hor_za8_u8_vg2(uint32_t base, svuint8x2_t val) {
+  SVE_ACLE_FUNC(svwrite_hor_za8,_u8,_vg2,)(base, 14, val);
+}
+__attribute__((arm_streaming,
arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za8_s8_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[VAL]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 14 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg2.nxv16i8(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_hor_za8_s8_vg2j10svint8x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[VAL]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 14 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg2.nxv16i8(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za8_s8_vg2(uint32_t base, svint8x2_t val) { + SVE_ACLE_FUNC(svwrite_hor_za8,_s8,_vg2,)(base, 14, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za8_u8_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[VAL]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[VAL]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[VAL]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[BASE:%.*]], 12 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_hor_za8_u8_vg4j11svuint8x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[VAL]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[VAL]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[VAL]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[BASE:%.*]], 12 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za8_u8_vg4(uint32_t base, svuint8x4_t val) { + SVE_ACLE_FUNC(svwrite_hor_za8,_u8,_vg4,)(base, 12, val); +} +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za8_s8_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[VAL]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[VAL]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[VAL]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[BASE:%.*]], 12 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_hor_za8_s8_vg4j10svint8x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call 
@llvm.vector.extract.nxv16i8.nxv64i8( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[VAL]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[VAL]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[VAL]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[BASE:%.*]], 12 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za8_s8_vg4(uint32_t base, svint8x4_t val) { + SVE_ACLE_FUNC(svwrite_hor_za8,_s8,_vg4,)(base, 12, val); +} +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za8_u8_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[VAL]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[VAL]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[VAL]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[BASE:%.*]], 12 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_ver_za8_u8_vg4j11svuint8x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[VAL]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[VAL]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[VAL]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[BASE:%.*]], 12 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za8_u8_vg4(uint32_t base, svuint8x4_t val) { + SVE_ACLE_FUNC(svwrite_ver_za8,_u8,_vg4,)(base, 12, val); +} +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za8_s8_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[VAL]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[VAL]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[VAL]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[BASE:%.*]], 12 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_ver_za8_s8_vg4j10svint8x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[VAL]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[VAL]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[VAL]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[BASE:%.*]], 12 +// 
CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za8_s8_vg4(uint32_t base, svint8x4_t val) { + SVE_ACLE_FUNC(svwrite_ver_za8,_s8,_vg4,)(base, 12, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za16_u16_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[VAL]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_hor_za16_u16_vg2j12svuint16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[VAL]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za16_u16_vg2(uint32_t base, svuint16x2_t val) { + SVE_ACLE_FUNC(svwrite_hor_za16,_u16,_vg2,)(base, 6, val); +} +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za16_bf16_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[VAL]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg2.nxv8bf16(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z30test_svwrite_hor_za16_bf16_vg2j14svbfloat16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[VAL]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg2.nxv8bf16(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za16_bf16_vg2(uint32_t base, svbfloat16x2_t val) { + SVE_ACLE_FUNC(svwrite_hor_za16,_bf16,_vg2,)(base, 6, val); +} +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za16_f16_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[VAL]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg2.nxv8f16(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_hor_za16_f16_vg2j13svfloat16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[VAL]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg2.nxv8f16(i32 [[TMP2]], 
[[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za16_f16_vg2(uint32_t base, svfloat16x2_t val) { + SVE_ACLE_FUNC(svwrite_hor_za16,_f16,_vg2,)(base, 6, val); +} +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za16_s16_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[VAL]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_hor_za16_s16_vg2j11svint16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[VAL]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za16_s16_vg2(uint32_t base, svint16x2_t val) { + SVE_ACLE_FUNC(svwrite_hor_za16,_s16,_vg2,)(base, 6, val); +} +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za16_u16_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[VAL]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_ver_za16_u16_vg2j12svuint16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[VAL]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za16_u16_vg2(uint32_t base, svuint16x2_t val) { + SVE_ACLE_FUNC(svwrite_ver_za16,_u16,_vg2,)(base, 6, val); +} +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za16_bf16_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[VAL]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg2.nxv8bf16(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z30test_svwrite_ver_za16_bf16_vg2j14svbfloat16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[VAL]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg2.nxv8bf16(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za16_bf16_vg2(uint32_t base, 
svbfloat16x2_t val) { + SVE_ACLE_FUNC(svwrite_ver_za16,_bf16,_vg2,)(base, 6, val); +} +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za16_f16_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[VAL]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg2.nxv8f16(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_ver_za16_f16_vg2j13svfloat16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[VAL]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg2.nxv8f16(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za16_f16_vg2(uint32_t base, svfloat16x2_t val) { + SVE_ACLE_FUNC(svwrite_ver_za16,_f16,_vg2,)(base, 6, val); +} +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za16_s16_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[VAL]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_ver_za16_s16_vg2j11svint16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[VAL]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za16_s16_vg2(uint32_t base, svint16x2_t val) { + SVE_ACLE_FUNC(svwrite_ver_za16,_s16,_vg2,)(base, 6, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za16_u16_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[VAL]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[VAL]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[VAL]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_hor_za16_u16_vg4j12svuint16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[VAL]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[VAL]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call 
@llvm.vector.extract.nxv8i16.nxv32i16( [[VAL]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za16_u16_vg4(uint32_t base, svuint16x4_t val) { + SVE_ACLE_FUNC(svwrite_hor_za16,_u16,_vg4,)(base, 4, val); +} +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za16_bf16_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[VAL]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[VAL]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[VAL]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg4.nxv8bf16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z30test_svwrite_hor_za16_bf16_vg4j14svbfloat16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[VAL]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[VAL]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[VAL]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg4.nxv8bf16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za16_bf16_vg4(uint32_t base, svbfloat16x4_t val) { + SVE_ACLE_FUNC(svwrite_hor_za16,_bf16,_vg4,)(base, 4, val); +} +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za16_f16_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[VAL]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[VAL]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[VAL]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg4.nxv8f16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_hor_za16_f16_vg4j13svfloat16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[VAL]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[VAL]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[VAL]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg4.nxv8f16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za16_f16_vg4(uint32_t base, svfloat16x4_t val) { + 
SVE_ACLE_FUNC(svwrite_hor_za16,_f16,_vg4,)(base, 4, val); +} +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za16_s16_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[VAL]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[VAL]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[VAL]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_hor_za16_s16_vg4j11svint16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[VAL]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[VAL]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[VAL]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za16_s16_vg4(uint32_t base, svint16x4_t val) { + SVE_ACLE_FUNC(svwrite_hor_za16,_s16,_vg4,)(base, 4, val); +} +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za16_u16_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[VAL]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[VAL]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[VAL]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_ver_za16_u16_vg4j12svuint16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[VAL]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[VAL]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[VAL]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za16_u16_vg4(uint32_t base, svuint16x4_t val) { + SVE_ACLE_FUNC(svwrite_ver_za16,_u16,_vg4,)(base, 4, val); +} +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za16_bf16_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[VAL]], i64 8) +// CHECK-NEXT: 
[[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[VAL]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[VAL]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg4.nxv8bf16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z30test_svwrite_ver_za16_bf16_vg4j14svbfloat16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[VAL]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[VAL]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[VAL]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg4.nxv8bf16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za16_bf16_vg4(uint32_t base, svbfloat16x4_t val) { + SVE_ACLE_FUNC(svwrite_ver_za16,_bf16,_vg4,)(base, 4, val); +} +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za16_f16_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[VAL]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[VAL]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[VAL]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg4.nxv8f16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_ver_za16_f16_vg4j13svfloat16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[VAL]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[VAL]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[VAL]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg4.nxv8f16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za16_f16_vg4(uint32_t base, svfloat16x4_t val) { + SVE_ACLE_FUNC(svwrite_ver_za16,_f16,_vg4,)(base, 4, val); +} +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za16_s16_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[VAL]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[VAL]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[VAL]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: 
ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_ver_za16_s16_vg4j11svint16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[VAL]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[VAL]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[VAL]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za16_s16_vg4(uint32_t base, svint16x4_t val) { + SVE_ACLE_FUNC(svwrite_ver_za16,_s16,_vg4,)(base, 4, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za32_u32_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[VAL]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 2 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg2.nxv4i32(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_hor_za32_u32_vg2j12svuint32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[VAL]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 2 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg2.nxv4i32(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za32_u32_vg2(uint32_t base, svuint32x2_t val) { + SVE_ACLE_FUNC(svwrite_hor_za32,_u32,_vg2,)(base, 2, val); +} +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za32_f32_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[VAL]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 2 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg2.nxv4f32(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_hor_za32_f32_vg2j13svfloat32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[VAL]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 2 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg2.nxv4f32(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za32_f32_vg2(uint32_t base, svfloat32x2_t val) { + SVE_ACLE_FUNC(svwrite_hor_za32,_f32,_vg2,)(base, 2, val); +} +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za32_s32_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[VAL]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 2 +// CHECK-NEXT: tail call void 
@llvm.aarch64.sme.write.hor.vg2.nxv4i32(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_hor_za32_s32_vg2j11svint32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[VAL]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 2 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg2.nxv4i32(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za32_s32_vg2(uint32_t base, svint32x2_t val) { + SVE_ACLE_FUNC(svwrite_hor_za32,_s32,_vg2,)(base, 2, val); +} +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za32_u32_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[VAL]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 2 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg2.nxv4i32(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_ver_za32_u32_vg2j12svuint32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[VAL]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 2 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg2.nxv4i32(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za32_u32_vg2(uint32_t base, svuint32x2_t val) { + SVE_ACLE_FUNC(svwrite_ver_za32,_u32,_vg2,)(base, 2, val); +} +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za32_f32_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[VAL]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 2 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg2.nxv4f32(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_ver_za32_f32_vg2j13svfloat32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[VAL]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 2 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg2.nxv4f32(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za32_f32_vg2(uint32_t base, svfloat32x2_t val) { + SVE_ACLE_FUNC(svwrite_ver_za32,_f32,_vg2,)(base, 2, val); +} +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za32_s32_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[VAL]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 2 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg2.nxv4i32(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: 
@_Z29test_svwrite_ver_za32_s32_vg2j11svint32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[VAL]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 2 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg2.nxv4i32(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za32_s32_vg2(uint32_t base, svint32x2_t val) { + SVE_ACLE_FUNC(svwrite_ver_za32,_s32,_vg2,)(base, 2, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za32_u32_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[VAL]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[VAL]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[VAL]], i64 12) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg4.nxv4i32(i32 [[BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_hor_za32_u32_vg4j12svuint32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[VAL]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[VAL]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[VAL]], i64 12) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg4.nxv4i32(i32 [[BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za32_u32_vg4(uint32_t base, svuint32x4_t val) { + SVE_ACLE_FUNC(svwrite_hor_za32,_u32,_vg4,)(base, 0, val); +} +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za32_f32_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[VAL]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[VAL]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[VAL]], i64 12) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg4.nxv4f32(i32 [[BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_hor_za32_f32_vg4j13svfloat32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[VAL]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[VAL]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[VAL]], i64 12) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg4.nxv4f32(i32 [[BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za32_f32_vg4(uint32_t base, svfloat32x4_t val) { + SVE_ACLE_FUNC(svwrite_hor_za32,_f32,_vg4,)(base, 0, val); 
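+}
+
+// Editorial usage sketch: this helper is added by the editor and is not part
+// of the update_cc_test_checks.py output; its name is hypothetical. The _vg4
+// forms split the four-vector tuple with @llvm.vector.extract at element
+// offsets 0, 4, 8 and 12 (for 32-bit elements), and a constant slice offset
+// of 0 folds away entirely, so [[BASE]] is passed straight through to
+// @llvm.aarch64.sme.write.hor.vg4.nxv4f32, as the surrounding checks verify.
+__attribute__((arm_streaming, arm_shared_za))
+void editorial_write_vg4_sketch(uint32_t base, svfloat32x4_t val) {
+  SVE_ACLE_FUNC(svwrite_hor_za32,_f32,_vg4,)(base, 0, val);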
+} +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za32_s32_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[VAL]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[VAL]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[VAL]], i64 12) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg4.nxv4i32(i32 [[BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_hor_za32_s32_vg4j11svint32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[VAL]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[VAL]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[VAL]], i64 12) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg4.nxv4i32(i32 [[BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za32_s32_vg4(uint32_t base, svint32x4_t val) { + SVE_ACLE_FUNC(svwrite_hor_za32,_s32,_vg4,)(base, 0, val); +} +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za32_u32_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[VAL]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[VAL]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[VAL]], i64 12) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg4.nxv4i32(i32 [[BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_ver_za32_u32_vg4j12svuint32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[VAL]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[VAL]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[VAL]], i64 12) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg4.nxv4i32(i32 [[BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za32_u32_vg4(uint32_t base, svuint32x4_t val) { + SVE_ACLE_FUNC(svwrite_ver_za32,_u32,_vg4,)(base, 0, val); +} +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za32_f32_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[VAL]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[VAL]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[VAL]], i64 12) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg4.nxv4f32(i32 [[BASE:%.*]], [[TMP0]], 
[[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_ver_za32_f32_vg4j13svfloat32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[VAL]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[VAL]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[VAL]], i64 12) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg4.nxv4f32(i32 [[BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za32_f32_vg4(uint32_t base, svfloat32x4_t val) { + SVE_ACLE_FUNC(svwrite_ver_za32,_f32,_vg4,)(base, 0, val); +} +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za32_s32_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[VAL]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[VAL]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[VAL]], i64 12) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg4.nxv4i32(i32 [[BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_ver_za32_s32_vg4j11svint32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[VAL]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[VAL]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[VAL]], i64 12) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg4.nxv4i32(i32 [[BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za32_s32_vg4(uint32_t base, svint32x4_t val) { + SVE_ACLE_FUNC(svwrite_ver_za32,_s32,_vg4,)(base, 0, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za64_u64_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[VAL]], i64 2) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg2.nxv2i64(i32 [[BASE:%.*]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_hor_za64_u64_vg2j12svuint64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[VAL]], i64 2) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg2.nxv2i64(i32 [[BASE:%.*]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za64_u64_vg2(uint32_t base, svuint64x2_t val) { + SVE_ACLE_FUNC(svwrite_hor_za64,_u64,_vg2,)(base, 0, val); +} +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za64_f64_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call 
@llvm.vector.extract.nxv2f64.nxv4f64( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[VAL]], i64 2) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg2.nxv2f64(i32 [[BASE:%.*]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_hor_za64_f64_vg2j13svfloat64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[VAL]], i64 2) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg2.nxv2f64(i32 [[BASE:%.*]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za64_f64_vg2(uint32_t base, svfloat64x2_t val) { + SVE_ACLE_FUNC(svwrite_hor_za64,_f64,_vg2,)(base, 0, val); +} +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za64_s64_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[VAL]], i64 2) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg2.nxv2i64(i32 [[BASE:%.*]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_hor_za64_s64_vg2j11svint64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[VAL]], i64 2) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg2.nxv2i64(i32 [[BASE:%.*]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za64_s64_vg2(uint32_t base, svint64x2_t val) { + SVE_ACLE_FUNC(svwrite_hor_za64,_s64,_vg2,)(base, 0, val); +} +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za64_u64_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[VAL]], i64 2) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg2.nxv2i64(i32 [[BASE:%.*]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_ver_za64_u64_vg2j12svuint64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[VAL]], i64 2) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg2.nxv2i64(i32 [[BASE:%.*]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za64_u64_vg2(uint32_t base, svuint64x2_t val) { + SVE_ACLE_FUNC(svwrite_ver_za64,_u64,_vg2,)(base, 0, val); +} +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za64_f64_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[VAL]], i64 2) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg2.nxv2f64(i32 [[BASE:%.*]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_ver_za64_f64_vg2j13svfloat64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call 
@llvm.vector.extract.nxv2f64.nxv4f64( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[VAL]], i64 2) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg2.nxv2f64(i32 [[BASE:%.*]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za64_f64_vg2(uint32_t base, svfloat64x2_t val) { + SVE_ACLE_FUNC(svwrite_ver_za64,_f64,_vg2,)(base, 0, val); +} +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za64_s64_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[VAL]], i64 2) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg2.nxv2i64(i32 [[BASE:%.*]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_ver_za64_s64_vg2j11svint64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[VAL]], i64 2) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg2.nxv2i64(i32 [[BASE:%.*]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za64_s64_vg2(uint32_t base, svint64x2_t val) { + SVE_ACLE_FUNC(svwrite_ver_za64,_s64,_vg2,)(base, 0, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za64_u64_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 6) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg4.nxv2i64(i32 [[BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_hor_za64_u64_vg4j12svuint64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 6) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg4.nxv2i64(i32 [[BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za64_u64_vg4(uint32_t base, svuint64x4_t val) { + SVE_ACLE_FUNC(svwrite_hor_za64,_u64,_vg4,)(base, 0, val); +} +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za64_f64_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[VAL]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[VAL]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[VAL]], i64 6) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg4.nxv2f64(i32 [[BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], 
[[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_hor_za64_f64_vg4j13svfloat64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[VAL]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[VAL]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[VAL]], i64 6) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg4.nxv2f64(i32 [[BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za64_f64_vg4(uint32_t base, svfloat64x4_t val) { + SVE_ACLE_FUNC(svwrite_hor_za64,_f64,_vg4,)(base, 0, val); +} +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za64_s64_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 6) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg4.nxv2i64(i32 [[BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_hor_za64_s64_vg4j11svint64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 6) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg4.nxv2i64(i32 [[BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za64_s64_vg4(uint32_t base, svint64x4_t val) { + SVE_ACLE_FUNC(svwrite_hor_za64,_s64,_vg4,)(base, 0, val); +} +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za64_u64_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 6) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg4.nxv2i64(i32 [[BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_ver_za64_u64_vg4j12svuint64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 6) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg4.nxv2i64(i32 [[BASE:%.*]], 
[[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za64_u64_vg4(uint32_t base, svuint64x4_t val) { + SVE_ACLE_FUNC(svwrite_ver_za64,_u64,_vg4,)(base, 0, val); +} +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za64_f64_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[VAL]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[VAL]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[VAL]], i64 6) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg4.nxv2f64(i32 [[BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_ver_za64_f64_vg4j13svfloat64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[VAL]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[VAL]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[VAL]], i64 6) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg4.nxv2f64(i32 [[BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za64_f64_vg4(uint32_t base, svfloat64x4_t val) { + SVE_ACLE_FUNC(svwrite_ver_za64,_f64,_vg4,)(base, 0, val); +} +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za64_s64_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 6) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg4.nxv2i64(i32 [[BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_ver_za64_s64_vg4j11svint64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 6) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg4.nxv2i64(i32 [[BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za64_s64_vg4(uint32_t base, svint64x4_t val) { + SVE_ACLE_FUNC(svwrite_ver_za64,_s64,_vg4,)(base, 0, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_za64_u64_vg1x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[VAL]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 7 +// CHECK-NEXT: tail call void 
@llvm.aarch64.sme.write.vg1x2.nxv2i64(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_za64_u64_vg1x2j12svuint64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[VAL]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vg1x2.nxv2i64(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_za64_u64_vg1x2(uint32_t base, svuint64x2_t val) { + SVE_ACLE_FUNC(svwrite_za64,_u64,_vg1x2,)(base, 7, val); +} +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_za64_f64_vg1x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[VAL]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vg1x2.nxv2f64(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_za64_f64_vg1x2j13svfloat64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[VAL]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vg1x2.nxv2f64(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_za64_f64_vg1x2(uint32_t base, svfloat64x2_t val) { + SVE_ACLE_FUNC(svwrite_za64,_f64,_vg1x2,)(base, 7, val); +} +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_za64_s64_vg1x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[VAL]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vg1x2.nxv2i64(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_za64_s64_vg1x2j11svint64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[VAL]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vg1x2.nxv2i64(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_za64_s64_vg1x2(uint32_t base, svint64x2_t val) { + SVE_ACLE_FUNC(svwrite_za64,_s64,_vg1x2,)(base, 7, val); +} +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_za64_u64_vg1x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[BASE:%.*]], 7 +// CHECK-NEXT: 
tail call void @llvm.aarch64.sme.write.vg1x4.nxv2i64(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_za64_u64_vg1x4j12svuint64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vg1x4.nxv2i64(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_za64_u64_vg1x4(uint32_t base, svuint64x4_t val) { + SVE_ACLE_FUNC(svwrite_za64,_u64,_vg1x4,)(base, 7, val); +} +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_za64_f64_vg1x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[VAL]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[VAL]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[VAL]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vg1x4.nxv2f64(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_za64_f64_vg1x4j13svfloat64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[VAL]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[VAL]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[VAL]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vg1x4.nxv2f64(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_za64_f64_vg1x4(uint32_t base, svfloat64x4_t val) { + SVE_ACLE_FUNC(svwrite_za64,_f64,_vg1x4,)(base, 7, val); +} +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_za64_s64_vg1x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vg1x4.nxv2i64(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_za64_s64_vg1x4j11svint64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 2) +// 
CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vg1x4.nxv2i64(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_za64_s64_vg1x4(uint32_t base, svint64x4_t val) { + SVE_ACLE_FUNC(svwrite_za64,_s64,_vg1x4,)(base, 7, val); +} diff --git a/clang/test/CodeGen/aarch64-sve-inline-asm-crash.c b/clang/test/CodeGen/aarch64-sve-inline-asm-crash.c --- a/clang/test/CodeGen/aarch64-sve-inline-asm-crash.c +++ b/clang/test/CodeGen/aarch64-sve-inline-asm-crash.c @@ -20,5 +20,17 @@ return ret ; } +__SVCount_t funcB1(__SVCount_t in) +{ + __SVCount_t ret ; + asm volatile ( + "mov %[ret].b, %[in].b \n" + : [ret] "=w" (ret) + : [in] "w" (in) + :); + + return ret ; +} + // CHECK: funcB1 // CHECK-ERROR: fatal error: error in backend: Cannot select diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_psel.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_psel.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_psel.c @@ -0,0 +1,82 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -D__ARM_FEATURE_SME=1 -triple aarch64-none-linux-gnu \ +// RUN: -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -D__ARM_FEATURE_SME=1 -triple aarch64-none-linux-gnu \ +// RUN: -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK + +#include + +// CHECK-LABEL: @test_svpsel_b8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.psel.nxv16i1( [[P1:%.*]], [[P2:%.*]], i32 [[IDX:%.*]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z14test_svpsel_b8u10__SVBool_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.psel.nxv16i1( [[P1:%.*]], [[P2:%.*]], i32 [[IDX:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svbool_t test_svpsel_b8(svbool_t p1, svbool_t p2, uint32_t idx) { + return svpsel_b8(p1, p2, idx); +} + +// CHECK-LABEL: @test_svpsel_b16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[P1:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[P2:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.psel.nxv8i1( [[TMP0]], [[TMP1]], i32 [[IDX:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP2]]) +// CHECK-NEXT: ret [[TMP3]] +// +// CPP-CHECK-LABEL: @_Z15test_svpsel_b16u10__SVBool_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[P1:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[P2:%.*]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.psel.nxv8i1( [[TMP0]], [[TMP1]], i32 [[IDX:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP2]]) +// CPP-CHECK-NEXT: ret [[TMP3]] +// +svbool_t test_svpsel_b16(svbool_t p1, svbool_t p2, uint32_t idx) { + return svpsel_b16(p1, p2, idx); +} + +// CHECK-LABEL: @test_svpsel_b32( +// CHECK-NEXT: entry: +// 
CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[P1:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[P2:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.psel.nxv4i1( [[TMP0]], [[TMP1]], i32 [[IDX:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP2]]) +// CHECK-NEXT: ret [[TMP3]] +// +// CPP-CHECK-LABEL: @_Z15test_svpsel_b32u10__SVBool_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[P1:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[P2:%.*]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.psel.nxv4i1( [[TMP0]], [[TMP1]], i32 [[IDX:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP2]]) +// CPP-CHECK-NEXT: ret [[TMP3]] +// +svbool_t test_svpsel_b32(svbool_t p1, svbool_t p2, uint32_t idx) { + return svpsel_b32(p1, p2, idx); +} + +// CHECK-LABEL: @test_svpsel_b64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[P1:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[P2:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.psel.nxv2i1( [[TMP0]], [[TMP1]], i32 [[IDX:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP2]]) +// CHECK-NEXT: ret [[TMP3]] +// +// CPP-CHECK-LABEL: @_Z15test_svpsel_b64u10__SVBool_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[P1:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[P2:%.*]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.psel.nxv2i1( [[TMP0]], [[TMP1]], i32 [[IDX:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP2]]) +// CPP-CHECK-NEXT: ret [[TMP3]] +// +svbool_t test_svpsel_b64(svbool_t p1, svbool_t p2, uint32_t idx) { + return svpsel_b64(p1, p2, idx); +} diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_revd.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_revd.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_revd.c @@ -0,0 +1,390 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -D__ARM_FEATURE_SME=1 -triple aarch64-none-linux-gnu \ +// RUN: -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -D__ARM_FEATURE_SME=1 -triple aarch64-none-linux-gnu \ +// RUN: -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -D__ARM_FEATURE_SME=1 -triple aarch64-none-linux-gnu \ +// RUN: -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -D__ARM_FEATURE_SME=1 -triple aarch64-none-linux-gnu \ +// RUN: -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK + +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. 
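+// For example, SVE_ACLE_FUNC(svrevd, _s8, _z, )(pg, op) pastes to the
+// overloaded call svrevd_z(pg, op) when SVE_OVERLOADED_FORMS is defined,
+// and to the fully-suffixed call svrevd_s8_z(pg, op) otherwise.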
+#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED) A1##A3 +#else +#define SVE_ACLE_FUNC(A1, A2, A3, A4) A1##A2##A3##A4 +#endif + +// CHECK-LABEL: @test_svrevd_s8_z( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.revd.nxv16i8( zeroinitializer, [[PG:%.*]], [[OP:%.*]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z16test_svrevd_s8_zu10__SVBool_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.revd.nxv16i8( zeroinitializer, [[PG:%.*]], [[OP:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svint8_t test_svrevd_s8_z(svbool_t pg, svint8_t op) { + return SVE_ACLE_FUNC(svrevd, _s8, _z, )(pg, op); +} + +// CHECK-LABEL: @test_svrevd_s16_z( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv8i16( zeroinitializer, [[TMP0]], [[OP:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z17test_svrevd_s16_zu10__SVBool_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv8i16( zeroinitializer, [[TMP0]], [[OP:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svint16_t test_svrevd_s16_z(svbool_t pg, svint16_t op) { + return SVE_ACLE_FUNC(svrevd, _s16, _z, )(pg, op); +} + +// CHECK-LABEL: @test_svrevd_s32_z( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv4i32( zeroinitializer, [[TMP0]], [[OP:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z17test_svrevd_s32_zu10__SVBool_tu11__SVInt32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv4i32( zeroinitializer, [[TMP0]], [[OP:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svint32_t test_svrevd_s32_z(svbool_t pg, svint32_t op) { + return SVE_ACLE_FUNC(svrevd, _s32, _z, )(pg, op); +} + +// CHECK-LABEL: @test_svrevd_s64_z( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv2i64( zeroinitializer, [[TMP0]], [[OP:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z17test_svrevd_s64_zu10__SVBool_tu11__SVInt64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv2i64( zeroinitializer, [[TMP0]], [[OP:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svint64_t test_svrevd_s64_z(svbool_t pg, svint64_t op) { + return SVE_ACLE_FUNC(svrevd, _s64, _z, )(pg, op); +} + +// CHECK-LABEL: @test_svrevd_u8_z( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.revd.nxv16i8( zeroinitializer, [[PG:%.*]], [[OP:%.*]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z16test_svrevd_u8_zu10__SVBool_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.revd.nxv16i8( zeroinitializer, [[PG:%.*]], [[OP:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svuint8_t test_svrevd_u8_z(svbool_t pg, svuint8_t op) { + return 
SVE_ACLE_FUNC(svrevd, _u8, _z, )(pg, op); +} +// CHECK-LABEL: @test_svrevd_u16_z( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv8i16( zeroinitializer, [[TMP0]], [[OP:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z17test_svrevd_u16_zu10__SVBool_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv8i16( zeroinitializer, [[TMP0]], [[OP:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svuint16_t test_svrevd_u16_z(svbool_t pg, svuint16_t op) { + return SVE_ACLE_FUNC(svrevd, _u16, _z, )(pg, op); +} + +// CHECK-LABEL: @test_svrevd_u32_z( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv4i32( zeroinitializer, [[TMP0]], [[OP:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z17test_svrevd_u32_zu10__SVBool_tu12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv4i32( zeroinitializer, [[TMP0]], [[OP:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svuint32_t test_svrevd_u32_z(svbool_t pg, svuint32_t op) { + return SVE_ACLE_FUNC(svrevd, _u32, _z, )(pg, op); +} + +// CHECK-LABEL: @test_svrevd_u64_z( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv2i64( zeroinitializer, [[TMP0]], [[OP:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z17test_svrevd_u64_zu10__SVBool_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv2i64( zeroinitializer, [[TMP0]], [[OP:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svuint64_t test_svrevd_u64_z(svbool_t pg, svuint64_t op) { + return SVE_ACLE_FUNC(svrevd, _u64, _z, )(pg, op); +} + +// CHECK-LABEL: @test_svrevd_s8_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.revd.nxv16i8( [[INACTIVE:%.*]], [[PG:%.*]], [[OP:%.*]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z16test_svrevd_s8_mu10__SVInt8_tu10__SVBool_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.revd.nxv16i8( [[INACTIVE:%.*]], [[PG:%.*]], [[OP:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svint8_t test_svrevd_s8_m(svint8_t inactive, svbool_t pg, svint8_t op) { + return SVE_ACLE_FUNC(svrevd, _s8, _m, )(inactive, pg, op); +} + +// CHECK-LABEL: @test_svrevd_s16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv8i16( [[INACTIVE:%.*]], [[TMP0]], [[OP:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z17test_svrevd_s16_mu11__SVInt16_tu10__SVBool_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call 
@llvm.aarch64.sve.revd.nxv8i16( [[INACTIVE:%.*]], [[TMP0]], [[OP:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svint16_t test_svrevd_s16_m(svint16_t inactive, svbool_t pg, svint16_t op) { + return SVE_ACLE_FUNC(svrevd, _s16, _m, )(inactive, pg, op); +} + +// CHECK-LABEL: @test_svrevd_s32_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv4i32( [[INACTIVE:%.*]], [[TMP0]], [[OP:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z17test_svrevd_s32_mu11__SVInt32_tu10__SVBool_tu11__SVInt32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv4i32( [[INACTIVE:%.*]], [[TMP0]], [[OP:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svint32_t test_svrevd_s32_m(svint32_t inactive, svbool_t pg, svint32_t op) { + return SVE_ACLE_FUNC(svrevd, _s32, _m, )(inactive, pg, op); +} + +// CHECK-LABEL: @test_svrevd_s64_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv2i64( [[INACTIVE:%.*]], [[TMP0]], [[OP:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z17test_svrevd_s64_mu11__SVInt64_tu10__SVBool_tu11__SVInt64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv2i64( [[INACTIVE:%.*]], [[TMP0]], [[OP:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svint64_t test_svrevd_s64_m(svint64_t inactive, svbool_t pg, svint64_t op) { + return SVE_ACLE_FUNC(svrevd, _s64, _m, )(inactive, pg, op); +} + +// CHECK-LABEL: @test_svrevd_u8_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.revd.nxv16i8( [[INACTIVE:%.*]], [[PG:%.*]], [[OP:%.*]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z16test_svrevd_u8_mu11__SVUint8_tu10__SVBool_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.revd.nxv16i8( [[INACTIVE:%.*]], [[PG:%.*]], [[OP:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svuint8_t test_svrevd_u8_m(svuint8_t inactive, svbool_t pg, svuint8_t op) { + return SVE_ACLE_FUNC(svrevd, _u8, _m, )(inactive, pg, op); +} + +// CHECK-LABEL: @test_svrevd_u16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv8i16( [[INACTIVE:%.*]], [[TMP0]], [[OP:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z17test_svrevd_u16_mu12__SVUint16_tu10__SVBool_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv8i16( [[INACTIVE:%.*]], [[TMP0]], [[OP:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svuint16_t test_svrevd_u16_m(svuint16_t inactive, svbool_t pg, svuint16_t op) { + return SVE_ACLE_FUNC(svrevd, _u16, _m, )(inactive, pg, op); +} + +// CHECK-LABEL: @test_svrevd_u32_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call 
@llvm.aarch64.sve.revd.nxv4i32( [[INACTIVE:%.*]], [[TMP0]], [[OP:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z17test_svrevd_u32_mu12__SVUint32_tu10__SVBool_tu12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv4i32( [[INACTIVE:%.*]], [[TMP0]], [[OP:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svuint32_t test_svrevd_u32_m(svuint32_t inactive, svbool_t pg, svuint32_t op) { + return SVE_ACLE_FUNC(svrevd, _u32, _m, )(inactive, pg, op); +} + +// CHECK-LABEL: @test_svrevd_u64_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv2i64( [[INACTIVE:%.*]], [[TMP0]], [[OP:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z17test_svrevd_u64_mu12__SVUint64_tu10__SVBool_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv2i64( [[INACTIVE:%.*]], [[TMP0]], [[OP:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svuint64_t test_svrevd_u64_m(svuint64_t inactive, svbool_t pg, svuint64_t op) { + return SVE_ACLE_FUNC(svrevd, _u64, _m, )(inactive, pg, op); +} + +// CHECK-LABEL: @test_svrevd_s8_x( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.revd.nxv16i8( undef, [[PG:%.*]], [[OP:%.*]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z16test_svrevd_s8_xu10__SVBool_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.revd.nxv16i8( undef, [[PG:%.*]], [[OP:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svint8_t test_svrevd_s8_x(svbool_t pg, svint8_t op) { + return SVE_ACLE_FUNC(svrevd, _s8, _x, )(pg, op); +} + +// CHECK-LABEL: @test_svrevd_s16_x( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv8i16( undef, [[TMP0]], [[OP:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z17test_svrevd_s16_xu10__SVBool_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv8i16( undef, [[TMP0]], [[OP:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svint16_t test_svrevd_s16_x(svbool_t pg, svint16_t op) { + return SVE_ACLE_FUNC(svrevd, _s16, _x, )(pg, op); +} + +// CHECK-LABEL: @test_svrevd_s32_x( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv4i32( undef, [[TMP0]], [[OP:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z17test_svrevd_s32_xu10__SVBool_tu11__SVInt32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv4i32( undef, [[TMP0]], [[OP:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svint32_t test_svrevd_s32_x(svbool_t pg, svint32_t op) { + return SVE_ACLE_FUNC(svrevd, _s32, _x, )(pg, op); +} + +// CHECK-LABEL: @test_svrevd_s64_x( +// 
CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv2i64( undef, [[TMP0]], [[OP:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z17test_svrevd_s64_xu10__SVBool_tu11__SVInt64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv2i64( undef, [[TMP0]], [[OP:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svint64_t test_svrevd_s64_x(svbool_t pg, svint64_t op) { + return SVE_ACLE_FUNC(svrevd, _s64, _x, )(pg, op); +} + +// CHECK-LABEL: @test_svrevd_u8_x( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.revd.nxv16i8( undef, [[PG:%.*]], [[OP:%.*]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z16test_svrevd_u8_xu10__SVBool_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.revd.nxv16i8( undef, [[PG:%.*]], [[OP:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svuint8_t test_svrevd_u8_x(svbool_t pg, svuint8_t op) { + return SVE_ACLE_FUNC(svrevd, _u8, _x, )(pg, op); +} + +// CHECK-LABEL: @test_svrevd_u16_x( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv8i16( undef, [[TMP0]], [[OP:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z17test_svrevd_u16_xu10__SVBool_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv8i16( undef, [[TMP0]], [[OP:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svuint16_t test_svrevd_u16_x(svbool_t pg, svuint16_t op) { + return SVE_ACLE_FUNC(svrevd, _u16, _x, )(pg, op); +} + +// CHECK-LABEL: @test_svrevd_u32_x( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv4i32( undef, [[TMP0]], [[OP:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z17test_svrevd_u32_xu10__SVBool_tu12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv4i32( undef, [[TMP0]], [[OP:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svuint32_t test_svrevd_u32_x(svbool_t pg, svuint32_t op) { + return SVE_ACLE_FUNC(svrevd, _u32, _x, )(pg, op); +} + +// CHECK-LABEL: @test_svrevd_u64_x( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv2i64( undef, [[TMP0]], [[OP:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z17test_svrevd_u64_xu10__SVBool_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv2i64( undef, [[TMP0]], [[OP:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svuint64_t test_svrevd_u64_x(svbool_t pg, svuint64_t op) { + return SVE_ACLE_FUNC(svrevd, _u64, _x, )(pg, op); +} diff --git 
a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_sclamp.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_sclamp.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_sclamp.c @@ -0,0 +1,75 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -D__ARM_FEATURE_SME=1 -triple aarch64-none-linux-gnu \ +// RUN: -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -D__ARM_FEATURE_SME=1 -triple aarch64-none-linux-gnu \ +// RUN: -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -D__ARM_FEATURE_SME=1 -triple aarch64-none-linux-gnu \ +// RUN: -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -D__ARM_FEATURE_SME=1 -triple aarch64-none-linux-gnu \ +// RUN: -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK + +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED) A1##A3 +#else +#define SVE_ACLE_FUNC(A1, A2, A3, A4) A1##A2##A3##A4 +#endif + +// CHECK-LABEL: @test_svsclamp_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.sclamp.nxv16i8( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z16test_svsclamp_s8u10__SVInt8_tu10__SVInt8_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.sclamp.nxv16i8( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svint8_t test_svsclamp_s8(svint8_t op1, svint8_t op2, svint8_t op3) { + return SVE_ACLE_FUNC(svsclamp, _s8, , )(op1, op2, op3); +} + +// CHECK-LABEL: @test_svsclamp_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.sclamp.nxv8i16( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z17test_svsclamp_s16u11__SVInt16_tu11__SVInt16_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.sclamp.nxv8i16( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svint16_t test_svsclamp_s16(svint16_t op1, svint16_t op2, svint16_t op3) { + return SVE_ACLE_FUNC(svsclamp, _s16, , )(op1, op2, op3); +} + +// CHECK-LABEL: @test_svsclamp_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.sclamp.nxv4i32( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z17test_svsclamp_s32u11__SVInt32_tu11__SVInt32_tu11__SVInt32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.sclamp.nxv4i32( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svint32_t test_svsclamp_s32(svint32_t op1, svint32_t op2, svint32_t op3) { + return SVE_ACLE_FUNC(svsclamp, _s32, , )(op1, op2, op3); +} + +// CHECK-LABEL: @test_svsclamp_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.sclamp.nxv2i64( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z17test_svsclamp_s64u11__SVInt64_tu11__SVInt64_tu11__SVInt64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call 
@llvm.aarch64.sve.sclamp.nxv2i64( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svint64_t test_svsclamp_s64(svint64_t op1, svint64_t op2, svint64_t op3) { + return SVE_ACLE_FUNC(svsclamp, _s64, , )(op1, op2, op3); +} diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_uclamp.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_uclamp.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_uclamp.c @@ -0,0 +1,75 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -D__ARM_FEATURE_SME=1 -triple aarch64-none-linux-gnu \ +// RUN: -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -D__ARM_FEATURE_SME=1 -triple aarch64-none-linux-gnu \ +// RUN: -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -D__ARM_FEATURE_SME=1 -triple aarch64-none-linux-gnu \ +// RUN: -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -D__ARM_FEATURE_SME=1 -triple aarch64-none-linux-gnu \ +// RUN: -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK + +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED) A1##A3 +#else +#define SVE_ACLE_FUNC(A1, A2, A3, A4) A1##A2##A3##A4 +#endif + +// CHECK-LABEL: @test_svuclamp_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.uclamp.nxv16i8( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z16test_svuclamp_u8u11__SVUint8_tu11__SVUint8_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.uclamp.nxv16i8( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svuint8_t test_svuclamp_u8(svuint8_t op1, svuint8_t op2, svuint8_t op3) { + return SVE_ACLE_FUNC(svuclamp, _u8, , )(op1, op2, op3); +} + +// CHECK-LABEL: @test_svuclamp_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.uclamp.nxv8i16( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z17test_svuclamp_u16u12__SVUint16_tu12__SVUint16_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.uclamp.nxv8i16( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svuint16_t test_svuclamp_u16(svuint16_t op1, svuint16_t op2, svuint16_t op3) { + return SVE_ACLE_FUNC(svuclamp, _u16, , )(op1, op2, op3); +} + +// CHECK-LABEL: @test_svuclamp_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.uclamp.nxv4i32( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z17test_svuclamp_u32u12__SVUint32_tu12__SVUint32_tu12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.uclamp.nxv4i32( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svuint32_t test_svuclamp_u32(svuint32_t op1, svuint32_t op2, svuint32_t op3) { + return SVE_ACLE_FUNC(svuclamp, _u32, , )(op1, op2, op3); +} + +// CHECK-LABEL: @test_svuclamp_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail 
call @llvm.aarch64.sve.uclamp.nxv2i64( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z17test_svuclamp_u64u12__SVUint64_tu12__SVUint64_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.uclamp.nxv2i64( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svuint64_t test_svuclamp_u64(svuint64_t op1, svuint64_t op2, svuint64_t op3) { + return SVE_ACLE_FUNC(svuclamp, _u64, , )(op1, op2, op3); +} diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_ld1.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_ld1.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_ld1.c @@ -0,0 +1,1294 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -D__ARM_FEATURE_SME2 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -D__ARM_FEATURE_SME2 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -D__ARM_FEATURE_SME2 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -D__ARM_FEATURE_SME2 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 +#endif + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svld1_u8_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ld1.pn.vg2.nxv16i8(aarch64_svcount [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z16test_svld1_u8_x2u11__SVCount_tPKh( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ld1.pn.vg2.nxv16i8(aarch64_svcount [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint8x2_t test_svld1_u8_x2(svcount_t pn, const uint8_t *base) +{ + return SVE_ACLE_FUNC(svld1b,_u8,_x2,)(pn, base); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svld1_u16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ld1.pn.vg2.nxv8i16(aarch64_svcount [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], 
i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z17test_svld1_u16_x2u11__SVCount_tPKt( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ld1.pn.vg2.nxv8i16(aarch64_svcount [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint16x2_t test_svld1_u16_x2(svcount_t pn, const uint16_t *base) +{ + return SVE_ACLE_FUNC(svld1h,_u16,_x2,)(pn, base); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svld1_u32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ld1.pn.vg2.nxv4i32(aarch64_svcount [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z17test_svld1_u32_x2u11__SVCount_tPKj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ld1.pn.vg2.nxv4i32(aarch64_svcount [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint32x2_t test_svld1_u32_x2(svcount_t pn, const uint32_t *base) +{ + return SVE_ACLE_FUNC(svld1w,_u32,_x2,)(pn, base); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svld1_u64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ld1.pn.vg2.nxv2i64(aarch64_svcount [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z17test_svld1_u64_x2u11__SVCount_tPKm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ld1.pn.vg2.nxv2i64(aarch64_svcount [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint64x2_t test_svld1_u64_x2(svcount_t pn, const uint64_t *base) +{ + return 
SVE_ACLE_FUNC(svld1d,_u64,_x2,)(pn, base); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svld1_u8_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld1.pn.vg4.nxv16i8(aarch64_svcount [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z16test_svld1_u8_x4u11__SVCount_tPKh( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld1.pn.vg4.nxv16i8(aarch64_svcount [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint8x4_t test_svld1_u8_x4(svcount_t pn, const uint8_t *base) +{ + return SVE_ACLE_FUNC(svld1b,_u8,_x4,)(pn, base); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svld1_u16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld1.pn.vg4.nxv8i16(aarch64_svcount [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svld1_u16_x4u11__SVCount_tPKt( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld1.pn.vg4.nxv8i16(aarch64_svcount [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = 
tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint16x4_t test_svld1_u16_x4(svcount_t pn, const uint16_t *base) +{ + return SVE_ACLE_FUNC(svld1h,_u16,_x4,)(pn, base); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svld1_u32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld1.pn.vg4.nxv4i32(aarch64_svcount [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svld1_u32_x4u11__SVCount_tPKj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld1.pn.vg4.nxv4i32(aarch64_svcount [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint32x4_t test_svld1_u32_x4(svcount_t pn, const uint32_t *base) +{ + return SVE_ACLE_FUNC(svld1w,_u32,_x4,)(pn, base); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svld1_u64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld1.pn.vg4.nxv2i64(aarch64_svcount [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], 
[[TMP7]], i64 6)
+// CHECK-NEXT:    ret <vscale x 8 x i64> [[TMP8]]
+//
+// CPP-CHECK-LABEL: @_Z17test_svld1_u64_x4u11__SVCount_tPKm(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld1.pn.vg4.nxv2i64(aarch64_svcount [[PN:%.*]], ptr [[BASE:%.*]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 0
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> poison, <vscale x 2 x i64> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 1
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], i64 2)
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 2
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP4]], <vscale x 2 x i64> [[TMP5]], i64 4)
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 3
+// CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]], i64 6)
+// CPP-CHECK-NEXT:    ret <vscale x 8 x i64> [[TMP8]]
+//
+svuint64x4_t test_svld1_u64_x4(svcount_t pn, const uint64_t *base)
+{
+  return SVE_ACLE_FUNC(svld1d,_u64,_x4,)(pn, base);
+}
+
+__attribute__((arm_streaming))
+// CHECK-LABEL: @test_svld1_s8_x2(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.vg2.nxv16i8(aarch64_svcount [[PN:%.*]], ptr [[BASE:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 0
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> [[TMP1]], i64 0)
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 1
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
+// CHECK-NEXT:    ret <vscale x 32 x i8> [[TMP4]]
+//
+// CPP-CHECK-LABEL: @_Z16test_svld1_s8_x2u11__SVCount_tPKa(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.vg2.nxv16i8(aarch64_svcount [[PN:%.*]], ptr [[BASE:%.*]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 0
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 1
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
+// CPP-CHECK-NEXT:    ret <vscale x 32 x i8> [[TMP4]]
+//
+svint8x2_t test_svld1_s8_x2(svcount_t pn, const int8_t *base)
+{
+  return SVE_ACLE_FUNC(svld1b,_s8,_x2,)(pn, base);
+}
+
+__attribute__((arm_streaming))
+// CHECK-LABEL: @test_svld1_s16_x2(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.vg2.nxv8i16(aarch64_svcount [[PN:%.*]], ptr [[BASE:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 0
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> poison, <vscale x 8 x i16> [[TMP1]], i64 0)
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 1
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], i64 8)
+// CHECK-NEXT:    ret <vscale x 16 x i16> [[TMP4]]
+//
+// CPP-CHECK-LABEL: @_Z17test_svld1_s16_x2u11__SVCount_tPKs(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.vg2.nxv8i16(aarch64_svcount [[PN:%.*]], ptr [[BASE:%.*]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 0
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> poison, <vscale x 8 x i16> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 1
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], i64 8)
+// CPP-CHECK-NEXT:    ret <vscale x 16 x i16> [[TMP4]]
+//
+svint16x2_t test_svld1_s16_x2(svcount_t pn, const int16_t *base)
+{
+  return SVE_ACLE_FUNC(svld1h,_s16,_x2,)(pn, base);
+}
+
+__attribute__((arm_streaming))
+// CHECK-LABEL: @test_svld1_s32_x2(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld1.pn.vg2.nxv4i32(aarch64_svcount [[PN:%.*]], ptr [[BASE:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 0
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP1]], i64 0)
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 1
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], i64 4)
+// CHECK-NEXT:    ret <vscale x 8 x i32> [[TMP4]]
+//
+// CPP-CHECK-LABEL: @_Z17test_svld1_s32_x2u11__SVCount_tPKi(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld1.pn.vg2.nxv4i32(aarch64_svcount [[PN:%.*]], ptr [[BASE:%.*]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 0
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 1
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], i64 4)
+// CPP-CHECK-NEXT:    ret <vscale x 8 x i32> [[TMP4]]
+//
+svint32x2_t test_svld1_s32_x2(svcount_t pn, const int32_t *base)
+{
+  return SVE_ACLE_FUNC(svld1w,_s32,_x2,)(pn, base);
+}
+
+__attribute__((arm_streaming))
+// CHECK-LABEL: @test_svld1_s64_x2(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld1.pn.vg2.nxv2i64(aarch64_svcount [[PN:%.*]], ptr [[BASE:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 0
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> [[TMP1]], i64 0)
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 1
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], i64 2)
+// CHECK-NEXT:    ret <vscale x 4 x i64> [[TMP4]]
+//
+// CPP-CHECK-LABEL: @_Z17test_svld1_s64_x2u11__SVCount_tPKl(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld1.pn.vg2.nxv2i64(aarch64_svcount [[PN:%.*]], ptr [[BASE:%.*]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 0
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 1
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], i64 2)
+// CPP-CHECK-NEXT:    ret <vscale x 4 x i64> [[TMP4]]
+//
+svint64x2_t test_svld1_s64_x2(svcount_t pn, const int64_t *base)
+{
+  return SVE_ACLE_FUNC(svld1d,_s64,_x2,)(pn, base);
+}
+
+__attribute__((arm_streaming))
+// CHECK-LABEL: @test_svld1_s8_x4(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.vg4.nxv16i8(aarch64_svcount [[PN:%.*]], ptr [[BASE:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 0
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> poison, <vscale x 16 x i8> [[TMP1]], i64 0)
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 1
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
+// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 2
+// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], i64 32)
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 3
+// CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]], i64 48)
+// CHECK-NEXT:    ret <vscale x 64 x i8> [[TMP8]]
+//
+// CPP-CHECK-LABEL: @_Z16test_svld1_s8_x4u11__SVCount_tPKa(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.vg4.nxv16i8(aarch64_svcount [[PN:%.*]], ptr [[BASE:%.*]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 0
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> poison, <vscale x 16 x i8> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 1
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 2
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], i64 32)
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 3
+// CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]], i64 48)
+// CPP-CHECK-NEXT:    ret <vscale x 64 x i8> [[TMP8]]
+//
+svint8x4_t test_svld1_s8_x4(svcount_t pn, const int8_t *base)
+{
+  return SVE_ACLE_FUNC(svld1b,_s8,_x4,)(pn, base);
+}
+
+__attribute__((arm_streaming))
+// CHECK-LABEL: @test_svld1_s16_x4(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.vg4.nxv8i16(aarch64_svcount [[PN:%.*]], ptr [[BASE:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 0
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> poison, <vscale x 8 x i16> [[TMP1]], i64 0)
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 1
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], i64 8)
+// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 2
+// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], i64 16)
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 3
+// CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]], i64 24)
+// CHECK-NEXT:    ret <vscale x 32 x i16> [[TMP8]]
+//
+// CPP-CHECK-LABEL: @_Z17test_svld1_s16_x4u11__SVCount_tPKs(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.vg4.nxv8i16(aarch64_svcount [[PN:%.*]], ptr [[BASE:%.*]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 0
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> poison, <vscale x 8 x i16> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 1
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], i64 8)
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 2
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], i64 16)
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 3
+// CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]], i64 24)
+// CPP-CHECK-NEXT:    ret <vscale x 32 x i16> [[TMP8]]
+//
+svint16x4_t test_svld1_s16_x4(svcount_t pn, const int16_t *base)
+{
+  return SVE_ACLE_FUNC(svld1h,_s16,_x4,)(pn, base);
+}
+
+__attribute__((arm_streaming))
+// CHECK-LABEL: @test_svld1_s32_x4(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld1.pn.vg4.nxv4i32(aarch64_svcount [[PN:%.*]], ptr [[BASE:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 0
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> poison, <vscale x 4 x i32> [[TMP1]], i64 0)
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 1
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], i64 4)
+// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 2
+// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP4]], <vscale x 4 x i32> [[TMP5]], i64 8)
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 3
+// CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]], i64 12)
+// CHECK-NEXT:    ret <vscale x 16 x i32> [[TMP8]]
+//
+// CPP-CHECK-LABEL: @_Z17test_svld1_s32_x4u11__SVCount_tPKi(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld1.pn.vg4.nxv4i32(aarch64_svcount [[PN:%.*]], ptr [[BASE:%.*]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 0
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> poison, <vscale x 4 x i32> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 1
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], i64 4)
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 2
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP4]], <vscale x 4 x i32> [[TMP5]], i64 8)
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 3
+// CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]], i64 12)
+// CPP-CHECK-NEXT:    ret <vscale x 16 x i32> [[TMP8]]
+//
+svint32x4_t test_svld1_s32_x4(svcount_t pn, const int32_t *base)
+{
+  return SVE_ACLE_FUNC(svld1w,_s32,_x4,)(pn, base);
+}
+
+__attribute__((arm_streaming))
+// CHECK-LABEL: @test_svld1_s64_x4(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.