Index: clang/include/clang/Basic/TargetBuiltins.h =================================================================== --- clang/include/clang/Basic/TargetBuiltins.h +++ clang/include/clang/Basic/TargetBuiltins.h @@ -27,6 +27,7 @@ enum { LastTIBuiltin = clang::Builtin::FirstTSBuiltin - 1, #define BUILTIN(ID, TYPE, ATTRS) BI##ID, +#define TARGET_BUILTIN(ID, TYPE, ATTRS, FEATURE) BI##ID, #include "clang/Basic/BuiltinsNEON.def" FirstTSBuiltin }; Index: clang/include/clang/Basic/arm_neon.td =================================================================== --- clang/include/clang/Basic/arm_neon.td +++ clang/include/clang/Basic/arm_neon.td @@ -289,7 +289,7 @@ "UcUsUicsilPcPsfQUcQUsQUiQcQsQiQPcQPsQflUlQlQUlhdQhQdPlQPl"> { let isLaneQ = 1; } -let ArchGuard = "defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC)" in { +let TargetGuard = "bf16" in { def SPLAT_BF : WInst<"splat_lane", ".(!q)I", "bQb">; def SPLATQ_BF : WInst<"splat_laneq", ".(!Q)I", "bQb"> { let isLaneQ = 1; @@ -1120,14 +1120,14 @@ //////////////////////////////////////////////////////////////////////////////// // Crypto -let ArchGuard = "__ARM_ARCH >= 8 && defined(__ARM_FEATURE_AES)" in { +let ArchGuard = "__ARM_ARCH >= 8", TargetGuard = "aes" in { def AESE : SInst<"vaese", "...", "QUc">; def AESD : SInst<"vaesd", "...", "QUc">; def AESMC : SInst<"vaesmc", "..", "QUc">; def AESIMC : SInst<"vaesimc", "..", "QUc">; } -let ArchGuard = "__ARM_ARCH >= 8 && defined(__ARM_FEATURE_SHA2)" in { +let ArchGuard = "__ARM_ARCH >= 8", TargetGuard = "sha2" in { def SHA1H : SInst<"vsha1h", "11", "Ui">; def SHA1SU1 : SInst<"vsha1su1", "...", "QUi">; def SHA256SU0 : SInst<"vsha256su0", "...", "QUi">; @@ -1141,7 +1141,7 @@ def SHA256SU1 : SInst<"vsha256su1", "....", "QUi">; } -let ArchGuard = "defined(__ARM_FEATURE_SHA3) && defined(__aarch64__)" in { +let ArchGuard = "defined(__aarch64__)", TargetGuard = "sha3" in { def BCAX : SInst<"vbcax", "....", "QUcQUsQUiQUlQcQsQiQl">; def EOR3 : SInst<"veor3", "....", "QUcQUsQUiQUlQcQsQiQl">; def RAX1 : SInst<"vrax1", "...", "QUl">; @@ -1151,15 +1151,14 @@ } } -let ArchGuard = "defined(__ARM_FEATURE_SHA512) && defined(__aarch64__)" in { - +let ArchGuard = "defined(__aarch64__)", TargetGuard = "sha3" in { def SHA512SU0 : SInst<"vsha512su0", "...", "QUl">; def SHA512su1 : SInst<"vsha512su1", "....", "QUl">; def SHA512H : SInst<"vsha512h", "....", "QUl">; def SHA512H2 : SInst<"vsha512h2", "....", "QUl">; } -let ArchGuard = "defined(__ARM_FEATURE_SM3) && defined(__aarch64__)" in { +let ArchGuard = "defined(__aarch64__)", TargetGuard = "sm4" in { def SM3SS1 : SInst<"vsm3ss1", "....", "QUi">; def SM3TT1A : SInst<"vsm3tt1a", "....I", "QUi">; def SM3TT1B : SInst<"vsm3tt1b", "....I", "QUi">; @@ -1169,7 +1168,7 @@ def SM3PARTW2 : SInst<"vsm3partw2", "....", "QUi">; } -let ArchGuard = "defined(__ARM_FEATURE_SM4) && defined(__aarch64__)" in { +let ArchGuard = "defined(__aarch64__)", TargetGuard = "sm4" in { def SM4E : SInst<"vsm4e", "...", "QUi">; def SM4EKEY : SInst<"vsm4ekey", "...", "QUi">; } @@ -1648,7 +1647,7 @@ } // ArchGuard = "defined(__aarch64__)" // ARMv8.2-A FP16 vector intrinsics for A32/A64. -let ArchGuard = "defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)" in { +let TargetGuard = "fullfp16" in { // ARMv8.2-A FP16 one-operand vector intrinsics. 
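To make the guard change above concrete, here is a minimal user-level sketch (the file name, build lines, and function names are illustrative assumptions, not part of the patch): with TargetGuard = "aes" replacing the __ARM_FEATURE_AES preprocessor test, arm_neon.h declares the AES intrinsics unconditionally and the compiler instead checks the calling function's target features, so a bad call fails with a "requires target feature" diagnostic, the same shape of diagnostic the updated aarch64-neon-sm4-sm3.c test below expects for sm4, rather than as an implicitly declared function.

/* aes_example.c - illustrative sketch only.
 * Assumed build lines:
 *   clang --target=aarch64-linux-gnu -march=armv8-a+aes -c aes_example.c   (compiles)
 *   clang --target=aarch64-linux-gnu -march=armv8-a     -c aes_example.c   (expected to fail:
 *     "always_inline function 'vaeseq_u8' requires target feature 'aes'")
 */
#include <arm_neon.h>

/* One AES round: AESE (AddRoundKey/SubBytes/ShiftRows) followed by AESMC
   (MixColumns); both intrinsics sit behind TargetGuard = "aes" after this change. */
uint8x16_t aes_round(uint8x16_t block, uint8x16_t round_key) {
  return vaesmcq_u8(vaeseq_u8(block, round_key));
}

/* Because the check is now against per-function target features, it should also be
   possible to enable the feature on a single function in an otherwise non-AES TU
   (an assumption consistent with, but not demonstrated by, this patch's tests): */
__attribute__((target("aes")))
uint8x16_t aes_round_local(uint8x16_t block, uint8x16_t round_key) {
  return vaesmcq_u8(vaeseq_u8(block, round_key));
}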
@@ -1673,7 +1672,7 @@ def VCVTP_U16 : SInst<"vcvtp_u16", "U.", "hQh">; // Vector rounding - let ArchGuard = "__ARM_ARCH >= 8 && defined(__ARM_FEATURE_DIRECTED_ROUNDING) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)" in { + let ArchGuard = "__ARM_ARCH >= 8 && defined(__ARM_FEATURE_DIRECTED_ROUNDING)", TargetGuard = "fullfp16" in { def FRINTZH : SInst<"vrnd", "..", "hQh">; def FRINTNH : SInst<"vrndn", "..", "hQh">; def FRINTAH : SInst<"vrnda", "..", "hQh">; @@ -1722,7 +1721,7 @@ // Max/Min def VMAXH : SInst<"vmax", "...", "hQh">; def VMINH : SInst<"vmin", "...", "hQh">; - let ArchGuard = "__ARM_ARCH >= 8 && defined(__ARM_FEATURE_NUMERIC_MAXMIN) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)" in { + let ArchGuard = "__ARM_ARCH >= 8 && defined(__ARM_FEATURE_NUMERIC_MAXMIN)", TargetGuard = "fullfp16" in { def FMAXNMH : SInst<"vmaxnm", "...", "hQh">; def FMINNMH : SInst<"vminnm", "...", "hQh">; } @@ -1772,7 +1771,7 @@ } // ARMv8.2-A FP16 vector intrinsics for A64 only. -let ArchGuard = "defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(__aarch64__)" in { +let ArchGuard = "defined(__aarch64__)", TargetGuard = "fullfp16" in { // Vector rounding def FRINTIH : SInst<"vrndi", "..", "hQh">; @@ -1867,11 +1866,11 @@ } // v8.2-A dot product instructions. -let ArchGuard = "defined(__ARM_FEATURE_DOTPROD)" in { +let TargetGuard = "dotprod" in { def DOT : SInst<"vdot", "..(<<)(<<)", "iQiUiQUi">; def DOT_LANE : SOpInst<"vdot_lane", "..(<<)(<<q)I", "iQiUiQUi", OP_DOT_LN>; } -let ArchGuard = "defined(__ARM_FEATURE_DOTPROD) && defined(__aarch64__)" in { +let ArchGuard = "defined(__aarch64__)", TargetGuard = "dotprod" in { // Variants indexing into a 128-bit vector are A64 only. def UDOT_LANEQ : SOpInst<"vdot_laneq", "..(<<)(<<Q)I", "iQiUiQUi", OP_DOT_LNQ> { let isLaneQ = 1; @@ -1879,7 +1878,7 @@ } // v8.2-A FP16 fused multiply-add long instructions.
-let ArchGuard = "defined(__ARM_FEATURE_FP16_FML) && defined(__aarch64__)" in { +let ArchGuard = "defined(__aarch64__)", TargetGuard = "fp16fml" in { def VFMLAL_LOW : SInst<"vfmlal_low", ">>..", "hQh">; def VFMLSL_LOW : SInst<"vfmlsl_low", ">>..", "hQh">; def VFMLAL_HIGH : SInst<"vfmlal_high", ">>..", "hQh">; @@ -1904,7 +1903,7 @@ } } -let ArchGuard = "defined(__ARM_FEATURE_MATMUL_INT8)" in { +let TargetGuard = "i8mm" in { def VMMLA : SInst<"vmmla", "..(<<)(<<)", "QUiQi">; def VUSMMLA : SInst<"vusmmla", "..(<; @@ -1921,7 +1920,7 @@ } } -let ArchGuard = "defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC)" in { +let TargetGuard = "bf16" in { def VDOT_BF : SInst<"vbfdot", "..BB", "fQf">; def VDOT_LANE_BF : SOpInst<"vbfdot_lane", "..B(Bq)I", "fQf", OP_BFDOT_LN>; def VDOT_LANEQ_BF : SOpInst<"vbfdot_laneq", "..B(BQ)I", "fQf", OP_BFDOT_LNQ> { @@ -1965,7 +1964,7 @@ } // v8.3-A Vector complex addition intrinsics -let ArchGuard = "defined(__ARM_FEATURE_COMPLEX) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)" in { +let ArchGuard = "defined(__ARM_FEATURE_COMPLEX)", TargetGuard = "fullfp16" in { def VCADD_ROT90_FP16 : SInst<"vcadd_rot90", "...", "h">; def VCADD_ROT270_FP16 : SInst<"vcadd_rot270", "...", "h">; def VCADDQ_ROT90_FP16 : SInst<"vcaddq_rot90", "QQQ", "h">; @@ -1989,7 +1988,7 @@ } // V8.2-A BFloat intrinsics -let ArchGuard = "defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC)" in { +let TargetGuard = "bf16" in { def VCREATE_BF : NoTestOpInst<"vcreate", ".(IU>)", "b", OP_CAST> { let BigEndianSafe = 1; } @@ -2053,14 +2052,14 @@ def SCALAR_CVT_F32_BF16 : SOpInst<"vcvtah_f32", "(1F>)(1!)", "b", OP_CVT_F32_BF16>; } -let ArchGuard = "defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) && !defined(__aarch64__)" in { +let ArchGuard = "!defined(__aarch64__)", TargetGuard = "bf16" in { def VCVT_BF16_F32_A32_INTERNAL : WInst<"__a32_vcvt_bf16", "BQ", "f">; def VCVT_BF16_F32_A32 : SOpInst<"vcvt_bf16", "BQ", "f", OP_VCVT_BF16_F32_A32>; def VCVT_LOW_BF16_F32_A32 : SOpInst<"vcvt_low_bf16", "BQ", "Qf", OP_VCVT_BF16_F32_LO_A32>; def VCVT_HIGH_BF16_F32_A32 : SOpInst<"vcvt_high_bf16", "BBQ", "Qf", OP_VCVT_BF16_F32_HI_A32>; } -let ArchGuard = "defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) && defined(__aarch64__)" in { +let ArchGuard = "defined(__aarch64__)", TargetGuard = "bf16" in { def VCVT_LOW_BF16_F32_A64_INTERNAL : WInst<"__a64_vcvtq_low_bf16", "BQ", "Hf">; def VCVT_LOW_BF16_F32_A64 : SOpInst<"vcvt_low_bf16", "BQ", "Qf", OP_VCVT_BF16_F32_LO_A64>; def VCVT_HIGH_BF16_F32_A64 : SInst<"vcvt_high_bf16", "BBQ", "Qf">; @@ -2072,14 +2071,14 @@ def COPYQ_LANEQ_BF16 : IOpInst<"vcopy_laneq", "..I.I", "Qb", OP_COPY_LN>; } -let ArchGuard = "defined(__ARM_FEATURE_BF16) && !defined(__aarch64__)" in { +let ArchGuard = "!defined(__aarch64__)", TargetGuard = "bf16" in { let BigEndianSafe = 1 in { defm VREINTERPRET_BF : REINTERPRET_CROSS_TYPES< "csilUcUsUiUlhfPcPsPlQcQsQiQlQUcQUsQUiQUlQhQfQPcQPsQPl", "bQb">; } } -let ArchGuard = "defined(__ARM_FEATURE_BF16) && defined(__aarch64__)" in { +let ArchGuard = "defined(__aarch64__)", TargetGuard = "bf16" in { let BigEndianSafe = 1 in { defm VVREINTERPRET_BF : REINTERPRET_CROSS_TYPES< "csilUcUsUiUlhfdPcPsPlQcQsQiQlQUcQUsQUiQUlQhQfQdQPcQPsQPlQPk", "bQb">; Index: clang/include/clang/Basic/arm_neon_incl.td =================================================================== --- clang/include/clang/Basic/arm_neon_incl.td +++ clang/include/clang/Basic/arm_neon_incl.td @@ -265,6 +265,7 @@ string Prototype = p; string Types = t; string ArchGuard = ""; + string TargetGuard = ""; Operation Operation = o; bit 
BigEndianSafe = 0; Index: clang/lib/Basic/Targets/AArch64.cpp =================================================================== --- clang/lib/Basic/Targets/AArch64.cpp +++ clang/lib/Basic/Targets/AArch64.cpp @@ -25,6 +25,8 @@ const Builtin::Info AArch64TargetInfo::BuiltinInfo[] = { #define BUILTIN(ID, TYPE, ATTRS) \ {#ID, TYPE, ATTRS, nullptr, ALL_LANGUAGES, nullptr}, +#define TARGET_BUILTIN(ID, TYPE, ATTRS, FEATURE) \ + {#ID, TYPE, ATTRS, nullptr, ALL_LANGUAGES, FEATURE}, #include "clang/Basic/BuiltinsNEON.def" #define BUILTIN(ID, TYPE, ATTRS) \ Index: clang/lib/Basic/Targets/ARM.cpp =================================================================== --- clang/lib/Basic/Targets/ARM.cpp +++ clang/lib/Basic/Targets/ARM.cpp @@ -628,7 +628,8 @@ } bool ARMTargetInfo::hasBFloat16Type() const { - return HasBFloat16 && !SoftFloat; + // The __bf16 type is generally available so long as we have any fp registers. + return HasBFloat16 || (FPU && !SoftFloat); } bool ARMTargetInfo::isValidCPUName(StringRef Name) const { @@ -971,6 +972,8 @@ {#ID, TYPE, ATTRS, nullptr, ALL_LANGUAGES, nullptr}, #define LIBBUILTIN(ID, TYPE, ATTRS, HEADER) \ {#ID, TYPE, ATTRS, HEADER, ALL_LANGUAGES, nullptr}, +#define TARGET_BUILTIN(ID, TYPE, ATTRS, FEATURE) \ + {#ID, TYPE, ATTRS, nullptr, ALL_LANGUAGES, FEATURE}, #include "clang/Basic/BuiltinsNEON.def" #define BUILTIN(ID, TYPE, ATTRS) \ Index: clang/lib/CodeGen/CGBuiltin.cpp =================================================================== --- clang/lib/CodeGen/CGBuiltin.cpp +++ clang/lib/CodeGen/CGBuiltin.cpp @@ -5638,7 +5638,7 @@ TypeModifier } static const ARMVectorIntrinsicInfo ARMSIMDIntrinsicMap [] = { - NEONMAP1(__a32_vcvt_bf16_v, arm_neon_vcvtfp2bf, 0), + NEONMAP1(__a32_vcvt_bf16_f32, arm_neon_vcvtfp2bf, 0), NEONMAP0(splat_lane_v), NEONMAP0(splat_laneq_v), NEONMAP0(splatq_lane_v), @@ -5650,15 +5650,15 @@ NEONMAP0(vadd_v), NEONMAP0(vaddhn_v), NEONMAP0(vaddq_v), - NEONMAP1(vaesdq_v, arm_neon_aesd, 0), - NEONMAP1(vaeseq_v, arm_neon_aese, 0), - NEONMAP1(vaesimcq_v, arm_neon_aesimc, 0), - NEONMAP1(vaesmcq_v, arm_neon_aesmc, 0), - NEONMAP1(vbfdot_v, arm_neon_bfdot, 0), - NEONMAP1(vbfdotq_v, arm_neon_bfdot, 0), - NEONMAP1(vbfmlalbq_v, arm_neon_bfmlalb, 0), - NEONMAP1(vbfmlaltq_v, arm_neon_bfmlalt, 0), - NEONMAP1(vbfmmlaq_v, arm_neon_bfmmla, 0), + NEONMAP1(vaesdq_u8, arm_neon_aesd, 0), + NEONMAP1(vaeseq_u8, arm_neon_aese, 0), + NEONMAP1(vaesimcq_u8, arm_neon_aesimc, 0), + NEONMAP1(vaesmcq_u8, arm_neon_aesmc, 0), + NEONMAP1(vbfdot_f32, arm_neon_bfdot, 0), + NEONMAP1(vbfdotq_f32, arm_neon_bfdot, 0), + NEONMAP1(vbfmlalbq_f32, arm_neon_bfmlalb, 0), + NEONMAP1(vbfmlaltq_f32, arm_neon_bfmlalt, 0), + NEONMAP1(vbfmmlaq_f32, arm_neon_bfmmla, 0), NEONMAP1(vbsl_v, arm_neon_vbsl, AddRetType), NEONMAP1(vbslq_v, arm_neon_vbsl, AddRetType), NEONMAP1(vcadd_rot270_v, arm_neon_vcadd_rot270, Add1ArgType), @@ -5690,90 +5690,96 @@ NEONMAP1(vcnt_v, ctpop, Add1ArgType), NEONMAP1(vcntq_v, ctpop, Add1ArgType), NEONMAP1(vcvt_f16_f32, arm_neon_vcvtfp2hf, 0), - NEONMAP0(vcvt_f16_v), + NEONMAP0(vcvt_f16_s16), + NEONMAP0(vcvt_f16_u16), NEONMAP1(vcvt_f32_f16, arm_neon_vcvthf2fp, 0), NEONMAP0(vcvt_f32_v), - NEONMAP2(vcvt_n_f16_v, arm_neon_vcvtfxu2fp, arm_neon_vcvtfxs2fp, 0), + NEONMAP1(vcvt_n_f16_s16, arm_neon_vcvtfxs2fp, 0), + NEONMAP1(vcvt_n_f16_u16, arm_neon_vcvtfxu2fp, 0), NEONMAP2(vcvt_n_f32_v, arm_neon_vcvtfxu2fp, arm_neon_vcvtfxs2fp, 0), - NEONMAP1(vcvt_n_s16_v, arm_neon_vcvtfp2fxs, 0), + NEONMAP1(vcvt_n_s16_f16, arm_neon_vcvtfp2fxs, 0), NEONMAP1(vcvt_n_s32_v, arm_neon_vcvtfp2fxs, 0), 
NEONMAP1(vcvt_n_s64_v, arm_neon_vcvtfp2fxs, 0), - NEONMAP1(vcvt_n_u16_v, arm_neon_vcvtfp2fxu, 0), + NEONMAP1(vcvt_n_u16_f16, arm_neon_vcvtfp2fxu, 0), NEONMAP1(vcvt_n_u32_v, arm_neon_vcvtfp2fxu, 0), NEONMAP1(vcvt_n_u64_v, arm_neon_vcvtfp2fxu, 0), - NEONMAP0(vcvt_s16_v), + NEONMAP0(vcvt_s16_f16), NEONMAP0(vcvt_s32_v), NEONMAP0(vcvt_s64_v), - NEONMAP0(vcvt_u16_v), + NEONMAP0(vcvt_u16_f16), NEONMAP0(vcvt_u32_v), NEONMAP0(vcvt_u64_v), - NEONMAP1(vcvta_s16_v, arm_neon_vcvtas, 0), + NEONMAP1(vcvta_s16_f16, arm_neon_vcvtas, 0), NEONMAP1(vcvta_s32_v, arm_neon_vcvtas, 0), NEONMAP1(vcvta_s64_v, arm_neon_vcvtas, 0), - NEONMAP1(vcvta_u16_v, arm_neon_vcvtau, 0), + NEONMAP1(vcvta_u16_f16, arm_neon_vcvtau, 0), NEONMAP1(vcvta_u32_v, arm_neon_vcvtau, 0), NEONMAP1(vcvta_u64_v, arm_neon_vcvtau, 0), - NEONMAP1(vcvtaq_s16_v, arm_neon_vcvtas, 0), + NEONMAP1(vcvtaq_s16_f16, arm_neon_vcvtas, 0), NEONMAP1(vcvtaq_s32_v, arm_neon_vcvtas, 0), NEONMAP1(vcvtaq_s64_v, arm_neon_vcvtas, 0), - NEONMAP1(vcvtaq_u16_v, arm_neon_vcvtau, 0), + NEONMAP1(vcvtaq_u16_f16, arm_neon_vcvtau, 0), NEONMAP1(vcvtaq_u32_v, arm_neon_vcvtau, 0), NEONMAP1(vcvtaq_u64_v, arm_neon_vcvtau, 0), NEONMAP1(vcvth_bf16_f32, arm_neon_vcvtbfp2bf, 0), - NEONMAP1(vcvtm_s16_v, arm_neon_vcvtms, 0), + NEONMAP1(vcvtm_s16_f16, arm_neon_vcvtms, 0), NEONMAP1(vcvtm_s32_v, arm_neon_vcvtms, 0), NEONMAP1(vcvtm_s64_v, arm_neon_vcvtms, 0), - NEONMAP1(vcvtm_u16_v, arm_neon_vcvtmu, 0), + NEONMAP1(vcvtm_u16_f16, arm_neon_vcvtmu, 0), NEONMAP1(vcvtm_u32_v, arm_neon_vcvtmu, 0), NEONMAP1(vcvtm_u64_v, arm_neon_vcvtmu, 0), - NEONMAP1(vcvtmq_s16_v, arm_neon_vcvtms, 0), + NEONMAP1(vcvtmq_s16_f16, arm_neon_vcvtms, 0), NEONMAP1(vcvtmq_s32_v, arm_neon_vcvtms, 0), NEONMAP1(vcvtmq_s64_v, arm_neon_vcvtms, 0), - NEONMAP1(vcvtmq_u16_v, arm_neon_vcvtmu, 0), + NEONMAP1(vcvtmq_u16_f16, arm_neon_vcvtmu, 0), NEONMAP1(vcvtmq_u32_v, arm_neon_vcvtmu, 0), NEONMAP1(vcvtmq_u64_v, arm_neon_vcvtmu, 0), - NEONMAP1(vcvtn_s16_v, arm_neon_vcvtns, 0), + NEONMAP1(vcvtn_s16_f16, arm_neon_vcvtns, 0), NEONMAP1(vcvtn_s32_v, arm_neon_vcvtns, 0), NEONMAP1(vcvtn_s64_v, arm_neon_vcvtns, 0), - NEONMAP1(vcvtn_u16_v, arm_neon_vcvtnu, 0), + NEONMAP1(vcvtn_u16_f16, arm_neon_vcvtnu, 0), NEONMAP1(vcvtn_u32_v, arm_neon_vcvtnu, 0), NEONMAP1(vcvtn_u64_v, arm_neon_vcvtnu, 0), - NEONMAP1(vcvtnq_s16_v, arm_neon_vcvtns, 0), + NEONMAP1(vcvtnq_s16_f16, arm_neon_vcvtns, 0), NEONMAP1(vcvtnq_s32_v, arm_neon_vcvtns, 0), NEONMAP1(vcvtnq_s64_v, arm_neon_vcvtns, 0), - NEONMAP1(vcvtnq_u16_v, arm_neon_vcvtnu, 0), + NEONMAP1(vcvtnq_u16_f16, arm_neon_vcvtnu, 0), NEONMAP1(vcvtnq_u32_v, arm_neon_vcvtnu, 0), NEONMAP1(vcvtnq_u64_v, arm_neon_vcvtnu, 0), - NEONMAP1(vcvtp_s16_v, arm_neon_vcvtps, 0), + NEONMAP1(vcvtp_s16_f16, arm_neon_vcvtps, 0), NEONMAP1(vcvtp_s32_v, arm_neon_vcvtps, 0), NEONMAP1(vcvtp_s64_v, arm_neon_vcvtps, 0), - NEONMAP1(vcvtp_u16_v, arm_neon_vcvtpu, 0), + NEONMAP1(vcvtp_u16_f16, arm_neon_vcvtpu, 0), NEONMAP1(vcvtp_u32_v, arm_neon_vcvtpu, 0), NEONMAP1(vcvtp_u64_v, arm_neon_vcvtpu, 0), - NEONMAP1(vcvtpq_s16_v, arm_neon_vcvtps, 0), + NEONMAP1(vcvtpq_s16_f16, arm_neon_vcvtps, 0), NEONMAP1(vcvtpq_s32_v, arm_neon_vcvtps, 0), NEONMAP1(vcvtpq_s64_v, arm_neon_vcvtps, 0), - NEONMAP1(vcvtpq_u16_v, arm_neon_vcvtpu, 0), + NEONMAP1(vcvtpq_u16_f16, arm_neon_vcvtpu, 0), NEONMAP1(vcvtpq_u32_v, arm_neon_vcvtpu, 0), NEONMAP1(vcvtpq_u64_v, arm_neon_vcvtpu, 0), - NEONMAP0(vcvtq_f16_v), + NEONMAP0(vcvtq_f16_s16), + NEONMAP0(vcvtq_f16_u16), NEONMAP0(vcvtq_f32_v), - NEONMAP2(vcvtq_n_f16_v, arm_neon_vcvtfxu2fp, arm_neon_vcvtfxs2fp, 0), + 
NEONMAP1(vcvtq_n_f16_s16, arm_neon_vcvtfxs2fp, 0), + NEONMAP1(vcvtq_n_f16_u16, arm_neon_vcvtfxu2fp, 0), NEONMAP2(vcvtq_n_f32_v, arm_neon_vcvtfxu2fp, arm_neon_vcvtfxs2fp, 0), - NEONMAP1(vcvtq_n_s16_v, arm_neon_vcvtfp2fxs, 0), + NEONMAP1(vcvtq_n_s16_f16, arm_neon_vcvtfp2fxs, 0), NEONMAP1(vcvtq_n_s32_v, arm_neon_vcvtfp2fxs, 0), NEONMAP1(vcvtq_n_s64_v, arm_neon_vcvtfp2fxs, 0), - NEONMAP1(vcvtq_n_u16_v, arm_neon_vcvtfp2fxu, 0), + NEONMAP1(vcvtq_n_u16_f16, arm_neon_vcvtfp2fxu, 0), NEONMAP1(vcvtq_n_u32_v, arm_neon_vcvtfp2fxu, 0), NEONMAP1(vcvtq_n_u64_v, arm_neon_vcvtfp2fxu, 0), - NEONMAP0(vcvtq_s16_v), + NEONMAP0(vcvtq_s16_f16), NEONMAP0(vcvtq_s32_v), NEONMAP0(vcvtq_s64_v), - NEONMAP0(vcvtq_u16_v), + NEONMAP0(vcvtq_u16_f16), NEONMAP0(vcvtq_u32_v), NEONMAP0(vcvtq_u64_v), - NEONMAP2(vdot_v, arm_neon_udot, arm_neon_sdot, 0), - NEONMAP2(vdotq_v, arm_neon_udot, arm_neon_sdot, 0), + NEONMAP1(vdot_s32, arm_neon_sdot, 0), + NEONMAP1(vdot_u32, arm_neon_udot, 0), + NEONMAP1(vdotq_s32, arm_neon_sdot, 0), + NEONMAP1(vdotq_u32, arm_neon_udot, 0), NEONMAP0(vext_v), NEONMAP0(vextq_v), NEONMAP0(vfma_v), @@ -5818,7 +5824,8 @@ NEONMAP1(vminnm_v, arm_neon_vminnm, Add1ArgType), NEONMAP1(vminnmq_v, arm_neon_vminnm, Add1ArgType), NEONMAP2(vminq_v, arm_neon_vminu, arm_neon_vmins, Add1ArgType | UnsignedAlts), - NEONMAP2(vmmlaq_v, arm_neon_ummla, arm_neon_smmla, 0), + NEONMAP1(vmmlaq_s32, arm_neon_smmla, 0), + NEONMAP1(vmmlaq_u32, arm_neon_ummla, 0), NEONMAP0(vmovl_v), NEONMAP0(vmovn_v), NEONMAP1(vmul_v, arm_neon_vmulp, Add1ArgType), @@ -5891,12 +5898,12 @@ NEONMAP1(vrsqrts_v, arm_neon_vrsqrts, Add1ArgType), NEONMAP1(vrsqrtsq_v, arm_neon_vrsqrts, Add1ArgType), NEONMAP1(vrsubhn_v, arm_neon_vrsubhn, Add1ArgType), - NEONMAP1(vsha1su0q_v, arm_neon_sha1su0, 0), - NEONMAP1(vsha1su1q_v, arm_neon_sha1su1, 0), - NEONMAP1(vsha256h2q_v, arm_neon_sha256h2, 0), - NEONMAP1(vsha256hq_v, arm_neon_sha256h, 0), - NEONMAP1(vsha256su0q_v, arm_neon_sha256su0, 0), - NEONMAP1(vsha256su1q_v, arm_neon_sha256su1, 0), + NEONMAP1(vsha1su0q_u32, arm_neon_sha1su0, 0), + NEONMAP1(vsha1su1q_u32, arm_neon_sha1su1, 0), + NEONMAP1(vsha256h2q_u32, arm_neon_sha256h2, 0), + NEONMAP1(vsha256hq_u32, arm_neon_sha256h, 0), + NEONMAP1(vsha256su0q_u32, arm_neon_sha256su0, 0), + NEONMAP1(vsha256su1q_u32, arm_neon_sha256su1, 0), NEONMAP0(vshl_n_v), NEONMAP2(vshl_v, arm_neon_vshiftu, arm_neon_vshifts, Add1ArgType | UnsignedAlts), NEONMAP0(vshll_n_v), @@ -5930,9 +5937,9 @@ NEONMAP0(vtrnq_v), NEONMAP0(vtst_v), NEONMAP0(vtstq_v), - NEONMAP1(vusdot_v, arm_neon_usdot, 0), - NEONMAP1(vusdotq_v, arm_neon_usdot, 0), - NEONMAP1(vusmmlaq_v, arm_neon_usmmla, 0), + NEONMAP1(vusdot_s32, arm_neon_usdot, 0), + NEONMAP1(vusdotq_s32, arm_neon_usdot, 0), + NEONMAP1(vusmmlaq_s32, arm_neon_usmmla, 0), NEONMAP0(vuzp_v), NEONMAP0(vuzpq_v), NEONMAP0(vzip_v), @@ -5940,7 +5947,7 @@ }; static const ARMVectorIntrinsicInfo AArch64SIMDIntrinsicMap[] = { - NEONMAP1(__a64_vcvtq_low_bf16_v, aarch64_neon_bfcvtn, 0), + NEONMAP1(__a64_vcvtq_low_bf16_f32, aarch64_neon_bfcvtn, 0), NEONMAP0(splat_lane_v), NEONMAP0(splat_laneq_v), NEONMAP0(splatq_lane_v), @@ -5951,16 +5958,23 @@ NEONMAP0(vaddhn_v), NEONMAP0(vaddq_p128), NEONMAP0(vaddq_v), - NEONMAP1(vaesdq_v, aarch64_crypto_aesd, 0), - NEONMAP1(vaeseq_v, aarch64_crypto_aese, 0), - NEONMAP1(vaesimcq_v, aarch64_crypto_aesimc, 0), - NEONMAP1(vaesmcq_v, aarch64_crypto_aesmc, 0), - NEONMAP2(vbcaxq_v, aarch64_crypto_bcaxu, aarch64_crypto_bcaxs, Add1ArgType | UnsignedAlts), - NEONMAP1(vbfdot_v, aarch64_neon_bfdot, 0), - NEONMAP1(vbfdotq_v, aarch64_neon_bfdot, 
0), - NEONMAP1(vbfmlalbq_v, aarch64_neon_bfmlalb, 0), - NEONMAP1(vbfmlaltq_v, aarch64_neon_bfmlalt, 0), - NEONMAP1(vbfmmlaq_v, aarch64_neon_bfmmla, 0), + NEONMAP1(vaesdq_u8, aarch64_crypto_aesd, 0), + NEONMAP1(vaeseq_u8, aarch64_crypto_aese, 0), + NEONMAP1(vaesimcq_u8, aarch64_crypto_aesimc, 0), + NEONMAP1(vaesmcq_u8, aarch64_crypto_aesmc, 0), + NEONMAP2(vbcaxq_s16, aarch64_crypto_bcaxu, aarch64_crypto_bcaxs, Add1ArgType | UnsignedAlts), + NEONMAP2(vbcaxq_s32, aarch64_crypto_bcaxu, aarch64_crypto_bcaxs, Add1ArgType | UnsignedAlts), + NEONMAP2(vbcaxq_s64, aarch64_crypto_bcaxu, aarch64_crypto_bcaxs, Add1ArgType | UnsignedAlts), + NEONMAP2(vbcaxq_s8, aarch64_crypto_bcaxu, aarch64_crypto_bcaxs, Add1ArgType | UnsignedAlts), + NEONMAP2(vbcaxq_u16, aarch64_crypto_bcaxu, aarch64_crypto_bcaxs, Add1ArgType | UnsignedAlts), + NEONMAP2(vbcaxq_u32, aarch64_crypto_bcaxu, aarch64_crypto_bcaxs, Add1ArgType | UnsignedAlts), + NEONMAP2(vbcaxq_u64, aarch64_crypto_bcaxu, aarch64_crypto_bcaxs, Add1ArgType | UnsignedAlts), + NEONMAP2(vbcaxq_u8, aarch64_crypto_bcaxu, aarch64_crypto_bcaxs, Add1ArgType | UnsignedAlts), + NEONMAP1(vbfdot_f32, aarch64_neon_bfdot, 0), + NEONMAP1(vbfdotq_f32, aarch64_neon_bfdot, 0), + NEONMAP1(vbfmlalbq_f32, aarch64_neon_bfmlalb, 0), + NEONMAP1(vbfmlaltq_f32, aarch64_neon_bfmlalt, 0), + NEONMAP1(vbfmmlaq_f32, aarch64_neon_bfmmla, 0), NEONMAP1(vcadd_rot270_v, aarch64_neon_vcadd_rot270, Add1ArgType), NEONMAP1(vcadd_rot90_v, aarch64_neon_vcadd_rot90, Add1ArgType), NEONMAP1(vcaddq_rot270_v, aarch64_neon_vcadd_rot270, Add1ArgType), @@ -5998,46 +6012,59 @@ NEONMAP1(vcnt_v, ctpop, Add1ArgType), NEONMAP1(vcntq_v, ctpop, Add1ArgType), NEONMAP1(vcvt_f16_f32, aarch64_neon_vcvtfp2hf, 0), - NEONMAP0(vcvt_f16_v), + NEONMAP0(vcvt_f16_s16), + NEONMAP0(vcvt_f16_u16), NEONMAP1(vcvt_f32_f16, aarch64_neon_vcvthf2fp, 0), NEONMAP0(vcvt_f32_v), - NEONMAP2(vcvt_n_f16_v, aarch64_neon_vcvtfxu2fp, aarch64_neon_vcvtfxs2fp, 0), + NEONMAP1(vcvt_n_f16_s16, aarch64_neon_vcvtfxs2fp, 0), + NEONMAP1(vcvt_n_f16_u16, aarch64_neon_vcvtfxu2fp, 0), NEONMAP2(vcvt_n_f32_v, aarch64_neon_vcvtfxu2fp, aarch64_neon_vcvtfxs2fp, 0), NEONMAP2(vcvt_n_f64_v, aarch64_neon_vcvtfxu2fp, aarch64_neon_vcvtfxs2fp, 0), - NEONMAP1(vcvt_n_s16_v, aarch64_neon_vcvtfp2fxs, 0), + NEONMAP1(vcvt_n_s16_f16, aarch64_neon_vcvtfp2fxs, 0), NEONMAP1(vcvt_n_s32_v, aarch64_neon_vcvtfp2fxs, 0), NEONMAP1(vcvt_n_s64_v, aarch64_neon_vcvtfp2fxs, 0), - NEONMAP1(vcvt_n_u16_v, aarch64_neon_vcvtfp2fxu, 0), + NEONMAP1(vcvt_n_u16_f16, aarch64_neon_vcvtfp2fxu, 0), NEONMAP1(vcvt_n_u32_v, aarch64_neon_vcvtfp2fxu, 0), NEONMAP1(vcvt_n_u64_v, aarch64_neon_vcvtfp2fxu, 0), - NEONMAP0(vcvtq_f16_v), + NEONMAP0(vcvtq_f16_s16), + NEONMAP0(vcvtq_f16_u16), NEONMAP0(vcvtq_f32_v), - NEONMAP1(vcvtq_high_bf16_v, aarch64_neon_bfcvtn2, 0), - NEONMAP2(vcvtq_n_f16_v, aarch64_neon_vcvtfxu2fp, aarch64_neon_vcvtfxs2fp, 0), + NEONMAP1(vcvtq_high_bf16_f32, aarch64_neon_bfcvtn2, 0), + NEONMAP1(vcvtq_n_f16_s16, aarch64_neon_vcvtfxs2fp, 0), + NEONMAP1(vcvtq_n_f16_u16, aarch64_neon_vcvtfxu2fp, 0), NEONMAP2(vcvtq_n_f32_v, aarch64_neon_vcvtfxu2fp, aarch64_neon_vcvtfxs2fp, 0), NEONMAP2(vcvtq_n_f64_v, aarch64_neon_vcvtfxu2fp, aarch64_neon_vcvtfxs2fp, 0), - NEONMAP1(vcvtq_n_s16_v, aarch64_neon_vcvtfp2fxs, 0), + NEONMAP1(vcvtq_n_s16_f16, aarch64_neon_vcvtfp2fxs, 0), NEONMAP1(vcvtq_n_s32_v, aarch64_neon_vcvtfp2fxs, 0), NEONMAP1(vcvtq_n_s64_v, aarch64_neon_vcvtfp2fxs, 0), - NEONMAP1(vcvtq_n_u16_v, aarch64_neon_vcvtfp2fxu, 0), + NEONMAP1(vcvtq_n_u16_f16, aarch64_neon_vcvtfp2fxu, 0), NEONMAP1(vcvtq_n_u32_v, 
aarch64_neon_vcvtfp2fxu, 0), NEONMAP1(vcvtq_n_u64_v, aarch64_neon_vcvtfp2fxu, 0), NEONMAP1(vcvtx_f32_v, aarch64_neon_fcvtxn, AddRetType | Add1ArgType), - NEONMAP2(vdot_v, aarch64_neon_udot, aarch64_neon_sdot, 0), - NEONMAP2(vdotq_v, aarch64_neon_udot, aarch64_neon_sdot, 0), - NEONMAP2(veor3q_v, aarch64_crypto_eor3u, aarch64_crypto_eor3s, Add1ArgType | UnsignedAlts), + NEONMAP1(vdot_s32, aarch64_neon_sdot, 0), + NEONMAP1(vdot_u32, aarch64_neon_udot, 0), + NEONMAP1(vdotq_s32, aarch64_neon_sdot, 0), + NEONMAP1(vdotq_u32, aarch64_neon_udot, 0), + NEONMAP2(veor3q_s16, aarch64_crypto_eor3u, aarch64_crypto_eor3s, Add1ArgType | UnsignedAlts), + NEONMAP2(veor3q_s32, aarch64_crypto_eor3u, aarch64_crypto_eor3s, Add1ArgType | UnsignedAlts), + NEONMAP2(veor3q_s64, aarch64_crypto_eor3u, aarch64_crypto_eor3s, Add1ArgType | UnsignedAlts), + NEONMAP2(veor3q_s8, aarch64_crypto_eor3u, aarch64_crypto_eor3s, Add1ArgType | UnsignedAlts), + NEONMAP2(veor3q_u16, aarch64_crypto_eor3u, aarch64_crypto_eor3s, Add1ArgType | UnsignedAlts), + NEONMAP2(veor3q_u32, aarch64_crypto_eor3u, aarch64_crypto_eor3s, Add1ArgType | UnsignedAlts), + NEONMAP2(veor3q_u64, aarch64_crypto_eor3u, aarch64_crypto_eor3s, Add1ArgType | UnsignedAlts), + NEONMAP2(veor3q_u8, aarch64_crypto_eor3u, aarch64_crypto_eor3s, Add1ArgType | UnsignedAlts), NEONMAP0(vext_v), NEONMAP0(vextq_v), NEONMAP0(vfma_v), NEONMAP0(vfmaq_v), - NEONMAP1(vfmlal_high_v, aarch64_neon_fmlal2, 0), - NEONMAP1(vfmlal_low_v, aarch64_neon_fmlal, 0), - NEONMAP1(vfmlalq_high_v, aarch64_neon_fmlal2, 0), - NEONMAP1(vfmlalq_low_v, aarch64_neon_fmlal, 0), - NEONMAP1(vfmlsl_high_v, aarch64_neon_fmlsl2, 0), - NEONMAP1(vfmlsl_low_v, aarch64_neon_fmlsl, 0), - NEONMAP1(vfmlslq_high_v, aarch64_neon_fmlsl2, 0), - NEONMAP1(vfmlslq_low_v, aarch64_neon_fmlsl, 0), + NEONMAP1(vfmlal_high_f16, aarch64_neon_fmlal2, 0), + NEONMAP1(vfmlal_low_f16, aarch64_neon_fmlal, 0), + NEONMAP1(vfmlalq_high_f16, aarch64_neon_fmlal2, 0), + NEONMAP1(vfmlalq_low_f16, aarch64_neon_fmlal, 0), + NEONMAP1(vfmlsl_high_f16, aarch64_neon_fmlsl2, 0), + NEONMAP1(vfmlsl_low_f16, aarch64_neon_fmlsl, 0), + NEONMAP1(vfmlslq_high_f16, aarch64_neon_fmlsl2, 0), + NEONMAP1(vfmlslq_low_f16, aarch64_neon_fmlsl, 0), NEONMAP2(vhadd_v, aarch64_neon_uhadd, aarch64_neon_shadd, Add1ArgType | UnsignedAlts), NEONMAP2(vhaddq_v, aarch64_neon_uhadd, aarch64_neon_shadd, Add1ArgType | UnsignedAlts), NEONMAP2(vhsub_v, aarch64_neon_uhsub, aarch64_neon_shsub, Add1ArgType | UnsignedAlts), @@ -6048,7 +6075,8 @@ NEONMAP1(vld1q_x2_v, aarch64_neon_ld1x2, 0), NEONMAP1(vld1q_x3_v, aarch64_neon_ld1x3, 0), NEONMAP1(vld1q_x4_v, aarch64_neon_ld1x4, 0), - NEONMAP2(vmmlaq_v, aarch64_neon_ummla, aarch64_neon_smmla, 0), + NEONMAP1(vmmlaq_s32, aarch64_neon_smmla, 0), + NEONMAP1(vmmlaq_u32, aarch64_neon_ummla, 0), NEONMAP0(vmovl_v), NEONMAP0(vmovn_v), NEONMAP1(vmul_v, aarch64_neon_pmul, Add1ArgType), @@ -6095,7 +6123,7 @@ NEONMAP2(vqsub_v, aarch64_neon_uqsub, aarch64_neon_sqsub, Add1ArgType | UnsignedAlts), NEONMAP2(vqsubq_v, aarch64_neon_uqsub, aarch64_neon_sqsub, Add1ArgType | UnsignedAlts), NEONMAP1(vraddhn_v, aarch64_neon_raddhn, Add1ArgType), - NEONMAP1(vrax1q_v, aarch64_crypto_rax1, 0), + NEONMAP1(vrax1q_u64, aarch64_crypto_rax1, 0), NEONMAP2(vrecpe_v, aarch64_neon_frecpe, aarch64_neon_urecpe, 0), NEONMAP2(vrecpeq_v, aarch64_neon_frecpe, aarch64_neon_urecpe, 0), NEONMAP1(vrecps_v, aarch64_neon_frecps, Add1ArgType), @@ -6121,16 +6149,16 @@ NEONMAP1(vrsqrts_v, aarch64_neon_frsqrts, Add1ArgType), NEONMAP1(vrsqrtsq_v, aarch64_neon_frsqrts, Add1ArgType), 
NEONMAP1(vrsubhn_v, aarch64_neon_rsubhn, Add1ArgType), - NEONMAP1(vsha1su0q_v, aarch64_crypto_sha1su0, 0), - NEONMAP1(vsha1su1q_v, aarch64_crypto_sha1su1, 0), - NEONMAP1(vsha256h2q_v, aarch64_crypto_sha256h2, 0), - NEONMAP1(vsha256hq_v, aarch64_crypto_sha256h, 0), - NEONMAP1(vsha256su0q_v, aarch64_crypto_sha256su0, 0), - NEONMAP1(vsha256su1q_v, aarch64_crypto_sha256su1, 0), - NEONMAP1(vsha512h2q_v, aarch64_crypto_sha512h2, 0), - NEONMAP1(vsha512hq_v, aarch64_crypto_sha512h, 0), - NEONMAP1(vsha512su0q_v, aarch64_crypto_sha512su0, 0), - NEONMAP1(vsha512su1q_v, aarch64_crypto_sha512su1, 0), + NEONMAP1(vsha1su0q_u32, aarch64_crypto_sha1su0, 0), + NEONMAP1(vsha1su1q_u32, aarch64_crypto_sha1su1, 0), + NEONMAP1(vsha256h2q_u32, aarch64_crypto_sha256h2, 0), + NEONMAP1(vsha256hq_u32, aarch64_crypto_sha256h, 0), + NEONMAP1(vsha256su0q_u32, aarch64_crypto_sha256su0, 0), + NEONMAP1(vsha256su1q_u32, aarch64_crypto_sha256su1, 0), + NEONMAP1(vsha512h2q_u64, aarch64_crypto_sha512h2, 0), + NEONMAP1(vsha512hq_u64, aarch64_crypto_sha512h, 0), + NEONMAP1(vsha512su0q_u64, aarch64_crypto_sha512su0, 0), + NEONMAP1(vsha512su1q_u64, aarch64_crypto_sha512su1, 0), NEONMAP0(vshl_n_v), NEONMAP2(vshl_v, aarch64_neon_ushl, aarch64_neon_sshl, Add1ArgType | UnsignedAlts), NEONMAP0(vshll_n_v), @@ -6139,15 +6167,15 @@ NEONMAP0(vshr_n_v), NEONMAP0(vshrn_n_v), NEONMAP0(vshrq_n_v), - NEONMAP1(vsm3partw1q_v, aarch64_crypto_sm3partw1, 0), - NEONMAP1(vsm3partw2q_v, aarch64_crypto_sm3partw2, 0), - NEONMAP1(vsm3ss1q_v, aarch64_crypto_sm3ss1, 0), - NEONMAP1(vsm3tt1aq_v, aarch64_crypto_sm3tt1a, 0), - NEONMAP1(vsm3tt1bq_v, aarch64_crypto_sm3tt1b, 0), - NEONMAP1(vsm3tt2aq_v, aarch64_crypto_sm3tt2a, 0), - NEONMAP1(vsm3tt2bq_v, aarch64_crypto_sm3tt2b, 0), - NEONMAP1(vsm4ekeyq_v, aarch64_crypto_sm4ekey, 0), - NEONMAP1(vsm4eq_v, aarch64_crypto_sm4e, 0), + NEONMAP1(vsm3partw1q_u32, aarch64_crypto_sm3partw1, 0), + NEONMAP1(vsm3partw2q_u32, aarch64_crypto_sm3partw2, 0), + NEONMAP1(vsm3ss1q_u32, aarch64_crypto_sm3ss1, 0), + NEONMAP1(vsm3tt1aq_u32, aarch64_crypto_sm3tt1a, 0), + NEONMAP1(vsm3tt1bq_u32, aarch64_crypto_sm3tt1b, 0), + NEONMAP1(vsm3tt2aq_u32, aarch64_crypto_sm3tt2a, 0), + NEONMAP1(vsm3tt2bq_u32, aarch64_crypto_sm3tt2b, 0), + NEONMAP1(vsm4ekeyq_u32, aarch64_crypto_sm4ekey, 0), + NEONMAP1(vsm4eq_u32, aarch64_crypto_sm4e, 0), NEONMAP1(vst1_x2_v, aarch64_neon_st1x2, 0), NEONMAP1(vst1_x3_v, aarch64_neon_st1x3, 0), NEONMAP1(vst1_x4_v, aarch64_neon_st1x4, 0), @@ -6157,10 +6185,10 @@ NEONMAP0(vsubhn_v), NEONMAP0(vtst_v), NEONMAP0(vtstq_v), - NEONMAP1(vusdot_v, aarch64_neon_usdot, 0), - NEONMAP1(vusdotq_v, aarch64_neon_usdot, 0), - NEONMAP1(vusmmlaq_v, aarch64_neon_usmmla, 0), - NEONMAP1(vxarq_v, aarch64_crypto_xar, 0), + NEONMAP1(vusdot_s32, aarch64_neon_usdot, 0), + NEONMAP1(vusdotq_s32, aarch64_neon_usdot, 0), + NEONMAP1(vusmmlaq_s32, aarch64_neon_usmmla, 0), + NEONMAP1(vxarq_u64, aarch64_crypto_xar, 0), }; static const ARMVectorIntrinsicInfo AArch64SISDIntrinsicMap[] = { @@ -6402,6 +6430,160 @@ NEONMAP1(vrsqrtsh_f16, aarch64_neon_frsqrts, Add1ArgType), }; +// Some intrinsics are equivalent for codegen. 
+static const std::pair<unsigned, unsigned> NEONEquivalentIntrinsicMap[] = { + { NEON::BI__builtin_neon_splat_lane_bf16, NEON::BI__builtin_neon_splat_lane_v, }, + { NEON::BI__builtin_neon_splat_laneq_bf16, NEON::BI__builtin_neon_splat_laneq_v, }, + { NEON::BI__builtin_neon_splatq_lane_bf16, NEON::BI__builtin_neon_splatq_lane_v, }, + { NEON::BI__builtin_neon_splatq_laneq_bf16, NEON::BI__builtin_neon_splatq_laneq_v, }, + { NEON::BI__builtin_neon_vabd_f16, NEON::BI__builtin_neon_vabd_v, }, + { NEON::BI__builtin_neon_vabdq_f16, NEON::BI__builtin_neon_vabdq_v, }, + { NEON::BI__builtin_neon_vabs_f16, NEON::BI__builtin_neon_vabs_v, }, + { NEON::BI__builtin_neon_vabsq_f16, NEON::BI__builtin_neon_vabsq_v, }, + { NEON::BI__builtin_neon_vbsl_f16, NEON::BI__builtin_neon_vbsl_v, }, + { NEON::BI__builtin_neon_vbslq_f16, NEON::BI__builtin_neon_vbslq_v, }, + { NEON::BI__builtin_neon_vcadd_rot270_f16, NEON::BI__builtin_neon_vcadd_rot270_v, }, + { NEON::BI__builtin_neon_vcadd_rot90_f16, NEON::BI__builtin_neon_vcadd_rot90_v, }, + { NEON::BI__builtin_neon_vcaddq_rot270_f16, NEON::BI__builtin_neon_vcaddq_rot270_v, }, + { NEON::BI__builtin_neon_vcaddq_rot90_f16, NEON::BI__builtin_neon_vcaddq_rot90_v, }, + { NEON::BI__builtin_neon_vcage_f16, NEON::BI__builtin_neon_vcage_v, }, + { NEON::BI__builtin_neon_vcageq_f16, NEON::BI__builtin_neon_vcageq_v, }, + { NEON::BI__builtin_neon_vcagt_f16, NEON::BI__builtin_neon_vcagt_v, }, + { NEON::BI__builtin_neon_vcagtq_f16, NEON::BI__builtin_neon_vcagtq_v, }, + { NEON::BI__builtin_neon_vcale_f16, NEON::BI__builtin_neon_vcale_v, }, + { NEON::BI__builtin_neon_vcaleq_f16, NEON::BI__builtin_neon_vcaleq_v, }, + { NEON::BI__builtin_neon_vcalt_f16, NEON::BI__builtin_neon_vcalt_v, }, + { NEON::BI__builtin_neon_vcaltq_f16, NEON::BI__builtin_neon_vcaltq_v, }, + { NEON::BI__builtin_neon_vceqz_f16, NEON::BI__builtin_neon_vceqz_v, }, + { NEON::BI__builtin_neon_vceqzq_f16, NEON::BI__builtin_neon_vceqzq_v, }, + { NEON::BI__builtin_neon_vcgez_f16, NEON::BI__builtin_neon_vcgez_v, }, + { NEON::BI__builtin_neon_vcgezq_f16, NEON::BI__builtin_neon_vcgezq_v, }, + { NEON::BI__builtin_neon_vcgtz_f16, NEON::BI__builtin_neon_vcgtz_v, }, + { NEON::BI__builtin_neon_vcgtzq_f16, NEON::BI__builtin_neon_vcgtzq_v, }, + { NEON::BI__builtin_neon_vclez_f16, NEON::BI__builtin_neon_vclez_v, }, + { NEON::BI__builtin_neon_vclezq_f16, NEON::BI__builtin_neon_vclezq_v, }, + { NEON::BI__builtin_neon_vcltz_f16, NEON::BI__builtin_neon_vcltz_v, }, + { NEON::BI__builtin_neon_vcltzq_f16, NEON::BI__builtin_neon_vcltzq_v, }, + { NEON::BI__builtin_neon_vcmla_f16, NEON::BI__builtin_neon_vcmla_v, }, + { NEON::BI__builtin_neon_vcmla_rot180_f16, NEON::BI__builtin_neon_vcmla_rot180_v, }, + { NEON::BI__builtin_neon_vcmla_rot270_f16, NEON::BI__builtin_neon_vcmla_rot270_v, }, + { NEON::BI__builtin_neon_vcmla_rot90_f16, NEON::BI__builtin_neon_vcmla_rot90_v, }, + { NEON::BI__builtin_neon_vcmlaq_f16, NEON::BI__builtin_neon_vcmlaq_v, }, + { NEON::BI__builtin_neon_vcmlaq_rot180_f16, NEON::BI__builtin_neon_vcmlaq_rot180_v, }, + { NEON::BI__builtin_neon_vcmlaq_rot270_f16, NEON::BI__builtin_neon_vcmlaq_rot270_v, }, + { NEON::BI__builtin_neon_vcmlaq_rot90_f16, NEON::BI__builtin_neon_vcmlaq_rot90_v, }, + { NEON::BI__builtin_neon_vext_f16, NEON::BI__builtin_neon_vext_v, }, + { NEON::BI__builtin_neon_vextq_f16, NEON::BI__builtin_neon_vextq_v, }, + { NEON::BI__builtin_neon_vfma_f16, NEON::BI__builtin_neon_vfma_v, }, + { NEON::BI__builtin_neon_vfma_lane_f16, NEON::BI__builtin_neon_vfma_lane_v, }, + { NEON::BI__builtin_neon_vfma_laneq_f16,
NEON::BI__builtin_neon_vfma_laneq_v, }, + { NEON::BI__builtin_neon_vfmaq_f16, NEON::BI__builtin_neon_vfmaq_v, }, + { NEON::BI__builtin_neon_vfmaq_lane_f16, NEON::BI__builtin_neon_vfmaq_lane_v, }, + { NEON::BI__builtin_neon_vfmaq_laneq_f16, NEON::BI__builtin_neon_vfmaq_laneq_v, }, + { NEON::BI__builtin_neon_vld1_bf16_x2, NEON::BI__builtin_neon_vld1_x2_v }, + { NEON::BI__builtin_neon_vld1_bf16_x3, NEON::BI__builtin_neon_vld1_x3_v }, + { NEON::BI__builtin_neon_vld1_bf16_x4, NEON::BI__builtin_neon_vld1_x4_v }, + { NEON::BI__builtin_neon_vld1_bf16, NEON::BI__builtin_neon_vld1_v }, + { NEON::BI__builtin_neon_vld1_dup_bf16, NEON::BI__builtin_neon_vld1_dup_v }, + { NEON::BI__builtin_neon_vld1_lane_bf16, NEON::BI__builtin_neon_vld1_lane_v }, + { NEON::BI__builtin_neon_vld1q_bf16_x2, NEON::BI__builtin_neon_vld1q_x2_v }, + { NEON::BI__builtin_neon_vld1q_bf16_x3, NEON::BI__builtin_neon_vld1q_x3_v }, + { NEON::BI__builtin_neon_vld1q_bf16_x4, NEON::BI__builtin_neon_vld1q_x4_v }, + { NEON::BI__builtin_neon_vld1q_bf16, NEON::BI__builtin_neon_vld1q_v }, + { NEON::BI__builtin_neon_vld1q_dup_bf16, NEON::BI__builtin_neon_vld1q_dup_v }, + { NEON::BI__builtin_neon_vld1q_lane_bf16, NEON::BI__builtin_neon_vld1q_lane_v }, + { NEON::BI__builtin_neon_vld2_bf16, NEON::BI__builtin_neon_vld2_v }, + { NEON::BI__builtin_neon_vld2_dup_bf16, NEON::BI__builtin_neon_vld2_dup_v }, + { NEON::BI__builtin_neon_vld2_lane_bf16, NEON::BI__builtin_neon_vld2_lane_v }, + { NEON::BI__builtin_neon_vld2q_bf16, NEON::BI__builtin_neon_vld2q_v }, + { NEON::BI__builtin_neon_vld2q_dup_bf16, NEON::BI__builtin_neon_vld2q_dup_v }, + { NEON::BI__builtin_neon_vld2q_lane_bf16, NEON::BI__builtin_neon_vld2q_lane_v }, + { NEON::BI__builtin_neon_vld3_bf16, NEON::BI__builtin_neon_vld3_v }, + { NEON::BI__builtin_neon_vld3_dup_bf16, NEON::BI__builtin_neon_vld3_dup_v }, + { NEON::BI__builtin_neon_vld3_lane_bf16, NEON::BI__builtin_neon_vld3_lane_v }, + { NEON::BI__builtin_neon_vld3q_bf16, NEON::BI__builtin_neon_vld3q_v }, + { NEON::BI__builtin_neon_vld3q_dup_bf16, NEON::BI__builtin_neon_vld3q_dup_v }, + { NEON::BI__builtin_neon_vld3q_lane_bf16, NEON::BI__builtin_neon_vld3q_lane_v }, + { NEON::BI__builtin_neon_vld4_bf16, NEON::BI__builtin_neon_vld4_v }, + { NEON::BI__builtin_neon_vld4_dup_bf16, NEON::BI__builtin_neon_vld4_dup_v }, + { NEON::BI__builtin_neon_vld4_lane_bf16, NEON::BI__builtin_neon_vld4_lane_v }, + { NEON::BI__builtin_neon_vld4q_bf16, NEON::BI__builtin_neon_vld4q_v }, + { NEON::BI__builtin_neon_vld4q_dup_bf16, NEON::BI__builtin_neon_vld4q_dup_v }, + { NEON::BI__builtin_neon_vld4q_lane_bf16, NEON::BI__builtin_neon_vld4q_lane_v }, + { NEON::BI__builtin_neon_vmax_f16, NEON::BI__builtin_neon_vmax_v, }, + { NEON::BI__builtin_neon_vmaxnm_f16, NEON::BI__builtin_neon_vmaxnm_v, }, + { NEON::BI__builtin_neon_vmaxnmq_f16, NEON::BI__builtin_neon_vmaxnmq_v, }, + { NEON::BI__builtin_neon_vmaxq_f16, NEON::BI__builtin_neon_vmaxq_v, }, + { NEON::BI__builtin_neon_vmin_f16, NEON::BI__builtin_neon_vmin_v, }, + { NEON::BI__builtin_neon_vminnm_f16, NEON::BI__builtin_neon_vminnm_v, }, + { NEON::BI__builtin_neon_vminnmq_f16, NEON::BI__builtin_neon_vminnmq_v, }, + { NEON::BI__builtin_neon_vminq_f16, NEON::BI__builtin_neon_vminq_v, }, + { NEON::BI__builtin_neon_vmulx_f16, NEON::BI__builtin_neon_vmulx_v, }, + { NEON::BI__builtin_neon_vmulxq_f16, NEON::BI__builtin_neon_vmulxq_v, }, + { NEON::BI__builtin_neon_vpadd_f16, NEON::BI__builtin_neon_vpadd_v, }, + { NEON::BI__builtin_neon_vpaddq_f16, NEON::BI__builtin_neon_vpaddq_v, }, + { NEON::BI__builtin_neon_vpmax_f16, 
NEON::BI__builtin_neon_vpmax_v, }, + { NEON::BI__builtin_neon_vpmaxnm_f16, NEON::BI__builtin_neon_vpmaxnm_v, }, + { NEON::BI__builtin_neon_vpmaxnmq_f16, NEON::BI__builtin_neon_vpmaxnmq_v, }, + { NEON::BI__builtin_neon_vpmaxq_f16, NEON::BI__builtin_neon_vpmaxq_v, }, + { NEON::BI__builtin_neon_vpmin_f16, NEON::BI__builtin_neon_vpmin_v, }, + { NEON::BI__builtin_neon_vpminnm_f16, NEON::BI__builtin_neon_vpminnm_v, }, + { NEON::BI__builtin_neon_vpminnmq_f16, NEON::BI__builtin_neon_vpminnmq_v, }, + { NEON::BI__builtin_neon_vpminq_f16, NEON::BI__builtin_neon_vpminq_v, }, + { NEON::BI__builtin_neon_vrecpe_f16, NEON::BI__builtin_neon_vrecpe_v, }, + { NEON::BI__builtin_neon_vrecpeq_f16, NEON::BI__builtin_neon_vrecpeq_v, }, + { NEON::BI__builtin_neon_vrecps_f16, NEON::BI__builtin_neon_vrecps_v, }, + { NEON::BI__builtin_neon_vrecpsq_f16, NEON::BI__builtin_neon_vrecpsq_v, }, + { NEON::BI__builtin_neon_vrnd_f16, NEON::BI__builtin_neon_vrnd_v, }, + { NEON::BI__builtin_neon_vrnda_f16, NEON::BI__builtin_neon_vrnda_v, }, + { NEON::BI__builtin_neon_vrndaq_f16, NEON::BI__builtin_neon_vrndaq_v, }, + { NEON::BI__builtin_neon_vrndi_f16, NEON::BI__builtin_neon_vrndi_v, }, + { NEON::BI__builtin_neon_vrndiq_f16, NEON::BI__builtin_neon_vrndiq_v, }, + { NEON::BI__builtin_neon_vrndm_f16, NEON::BI__builtin_neon_vrndm_v, }, + { NEON::BI__builtin_neon_vrndmq_f16, NEON::BI__builtin_neon_vrndmq_v, }, + { NEON::BI__builtin_neon_vrndn_f16, NEON::BI__builtin_neon_vrndn_v, }, + { NEON::BI__builtin_neon_vrndnq_f16, NEON::BI__builtin_neon_vrndnq_v, }, + { NEON::BI__builtin_neon_vrndp_f16, NEON::BI__builtin_neon_vrndp_v, }, + { NEON::BI__builtin_neon_vrndpq_f16, NEON::BI__builtin_neon_vrndpq_v, }, + { NEON::BI__builtin_neon_vrndq_f16, NEON::BI__builtin_neon_vrndq_v, }, + { NEON::BI__builtin_neon_vrndx_f16, NEON::BI__builtin_neon_vrndx_v, }, + { NEON::BI__builtin_neon_vrndxq_f16, NEON::BI__builtin_neon_vrndxq_v, }, + { NEON::BI__builtin_neon_vrsqrte_f16, NEON::BI__builtin_neon_vrsqrte_v, }, + { NEON::BI__builtin_neon_vrsqrteq_f16, NEON::BI__builtin_neon_vrsqrteq_v, }, + { NEON::BI__builtin_neon_vrsqrts_f16, NEON::BI__builtin_neon_vrsqrts_v, }, + { NEON::BI__builtin_neon_vrsqrtsq_f16, NEON::BI__builtin_neon_vrsqrtsq_v, }, + { NEON::BI__builtin_neon_vsqrt_f16, NEON::BI__builtin_neon_vsqrt_v, }, + { NEON::BI__builtin_neon_vsqrtq_f16, NEON::BI__builtin_neon_vsqrtq_v, }, + { NEON::BI__builtin_neon_vst1_bf16_x2, NEON::BI__builtin_neon_vst1_x2_v }, + { NEON::BI__builtin_neon_vst1_bf16_x3, NEON::BI__builtin_neon_vst1_x3_v }, + { NEON::BI__builtin_neon_vst1_bf16_x4, NEON::BI__builtin_neon_vst1_x4_v }, + { NEON::BI__builtin_neon_vst1_bf16, NEON::BI__builtin_neon_vst1_v }, + { NEON::BI__builtin_neon_vst1_lane_bf16, NEON::BI__builtin_neon_vst1_lane_v }, + { NEON::BI__builtin_neon_vst1q_bf16_x2, NEON::BI__builtin_neon_vst1q_x2_v }, + { NEON::BI__builtin_neon_vst1q_bf16_x3, NEON::BI__builtin_neon_vst1q_x3_v }, + { NEON::BI__builtin_neon_vst1q_bf16_x4, NEON::BI__builtin_neon_vst1q_x4_v }, + { NEON::BI__builtin_neon_vst1q_bf16, NEON::BI__builtin_neon_vst1q_v }, + { NEON::BI__builtin_neon_vst1q_lane_bf16, NEON::BI__builtin_neon_vst1q_lane_v }, + { NEON::BI__builtin_neon_vst2_bf16, NEON::BI__builtin_neon_vst2_v }, + { NEON::BI__builtin_neon_vst2_lane_bf16, NEON::BI__builtin_neon_vst2_lane_v }, + { NEON::BI__builtin_neon_vst2q_bf16, NEON::BI__builtin_neon_vst2q_v }, + { NEON::BI__builtin_neon_vst2q_lane_bf16, NEON::BI__builtin_neon_vst2q_lane_v }, + { NEON::BI__builtin_neon_vst3_bf16, NEON::BI__builtin_neon_vst3_v }, + { 
NEON::BI__builtin_neon_vst3_lane_bf16, NEON::BI__builtin_neon_vst3_lane_v }, + { NEON::BI__builtin_neon_vst3q_bf16, NEON::BI__builtin_neon_vst3q_v }, + { NEON::BI__builtin_neon_vst3q_lane_bf16, NEON::BI__builtin_neon_vst3q_lane_v }, + { NEON::BI__builtin_neon_vst4_bf16, NEON::BI__builtin_neon_vst4_v }, + { NEON::BI__builtin_neon_vst4_lane_bf16, NEON::BI__builtin_neon_vst4_lane_v }, + { NEON::BI__builtin_neon_vst4q_bf16, NEON::BI__builtin_neon_vst4q_v }, + { NEON::BI__builtin_neon_vst4q_lane_bf16, NEON::BI__builtin_neon_vst4q_lane_v }, + { NEON::BI__builtin_neon_vtrn_f16, NEON::BI__builtin_neon_vtrn_v, }, + { NEON::BI__builtin_neon_vtrnq_f16, NEON::BI__builtin_neon_vtrnq_v, }, + { NEON::BI__builtin_neon_vuzp_f16, NEON::BI__builtin_neon_vuzp_v, }, + { NEON::BI__builtin_neon_vuzpq_f16, NEON::BI__builtin_neon_vuzpq_v, }, + { NEON::BI__builtin_neon_vzip_f16, NEON::BI__builtin_neon_vzip_v, }, + { NEON::BI__builtin_neon_vzipq_f16, NEON::BI__builtin_neon_vzipq_v, }, +}; + #undef NEONMAP0 #undef NEONMAP1 #undef NEONMAP2 @@ -6698,17 +6880,25 @@ HasLegalHalfType); return Usgn ? Builder.CreateUIToFP(Ops[0], Ty, "vcvt") : Builder.CreateSIToFP(Ops[0], Ty, "vcvt"); - case NEON::BI__builtin_neon_vcvt_f16_v: - case NEON::BI__builtin_neon_vcvtq_f16_v: + case NEON::BI__builtin_neon_vcvt_f16_s16: + case NEON::BI__builtin_neon_vcvt_f16_u16: + case NEON::BI__builtin_neon_vcvtq_f16_s16: + case NEON::BI__builtin_neon_vcvtq_f16_u16: Ops[0] = Builder.CreateBitCast(Ops[0], Ty); Ty = GetNeonType(this, NeonTypeFlags(NeonTypeFlags::Float16, false, Quad), HasLegalHalfType); return Usgn ? Builder.CreateUIToFP(Ops[0], Ty, "vcvt") : Builder.CreateSIToFP(Ops[0], Ty, "vcvt"); - case NEON::BI__builtin_neon_vcvt_n_f16_v: + case NEON::BI__builtin_neon_vcvt_n_f16_s16: + case NEON::BI__builtin_neon_vcvt_n_f16_u16: + case NEON::BI__builtin_neon_vcvtq_n_f16_s16: + case NEON::BI__builtin_neon_vcvtq_n_f16_u16: { + llvm::Type *Tys[2] = { GetFloatNeonType(this, Type), Ty }; + Function *F = CGM.getIntrinsic(Int, Tys); + return EmitNeonCall(F, Ops, "vcvt_n"); + } case NEON::BI__builtin_neon_vcvt_n_f32_v: case NEON::BI__builtin_neon_vcvt_n_f64_v: - case NEON::BI__builtin_neon_vcvtq_n_f16_v: case NEON::BI__builtin_neon_vcvtq_n_f32_v: case NEON::BI__builtin_neon_vcvtq_n_f64_v: { llvm::Type *Tys[2] = { GetFloatNeonType(this, Type), Ty }; @@ -6716,15 +6906,15 @@ Function *F = CGM.getIntrinsic(Int, Tys); return EmitNeonCall(F, Ops, "vcvt_n"); } - case NEON::BI__builtin_neon_vcvt_n_s16_v: + case NEON::BI__builtin_neon_vcvt_n_s16_f16: case NEON::BI__builtin_neon_vcvt_n_s32_v: - case NEON::BI__builtin_neon_vcvt_n_u16_v: + case NEON::BI__builtin_neon_vcvt_n_u16_f16: case NEON::BI__builtin_neon_vcvt_n_u32_v: case NEON::BI__builtin_neon_vcvt_n_s64_v: case NEON::BI__builtin_neon_vcvt_n_u64_v: - case NEON::BI__builtin_neon_vcvtq_n_s16_v: + case NEON::BI__builtin_neon_vcvtq_n_s16_f16: case NEON::BI__builtin_neon_vcvtq_n_s32_v: - case NEON::BI__builtin_neon_vcvtq_n_u16_v: + case NEON::BI__builtin_neon_vcvtq_n_u16_f16: case NEON::BI__builtin_neon_vcvtq_n_u32_v: case NEON::BI__builtin_neon_vcvtq_n_s64_v: case NEON::BI__builtin_neon_vcvtq_n_u64_v: { @@ -6736,64 +6926,64 @@ case NEON::BI__builtin_neon_vcvt_u32_v: case NEON::BI__builtin_neon_vcvt_s64_v: case NEON::BI__builtin_neon_vcvt_u64_v: - case NEON::BI__builtin_neon_vcvt_s16_v: - case NEON::BI__builtin_neon_vcvt_u16_v: + case NEON::BI__builtin_neon_vcvt_s16_f16: + case NEON::BI__builtin_neon_vcvt_u16_f16: case NEON::BI__builtin_neon_vcvtq_s32_v: case NEON::BI__builtin_neon_vcvtq_u32_v: case 
NEON::BI__builtin_neon_vcvtq_s64_v: case NEON::BI__builtin_neon_vcvtq_u64_v: - case NEON::BI__builtin_neon_vcvtq_s16_v: - case NEON::BI__builtin_neon_vcvtq_u16_v: { + case NEON::BI__builtin_neon_vcvtq_s16_f16: + case NEON::BI__builtin_neon_vcvtq_u16_f16: { Ops[0] = Builder.CreateBitCast(Ops[0], GetFloatNeonType(this, Type)); return Usgn ? Builder.CreateFPToUI(Ops[0], Ty, "vcvt") : Builder.CreateFPToSI(Ops[0], Ty, "vcvt"); } - case NEON::BI__builtin_neon_vcvta_s16_v: + case NEON::BI__builtin_neon_vcvta_s16_f16: case NEON::BI__builtin_neon_vcvta_s32_v: case NEON::BI__builtin_neon_vcvta_s64_v: - case NEON::BI__builtin_neon_vcvta_u16_v: + case NEON::BI__builtin_neon_vcvta_u16_f16: case NEON::BI__builtin_neon_vcvta_u32_v: case NEON::BI__builtin_neon_vcvta_u64_v: - case NEON::BI__builtin_neon_vcvtaq_s16_v: + case NEON::BI__builtin_neon_vcvtaq_s16_f16: case NEON::BI__builtin_neon_vcvtaq_s32_v: case NEON::BI__builtin_neon_vcvtaq_s64_v: - case NEON::BI__builtin_neon_vcvtaq_u16_v: + case NEON::BI__builtin_neon_vcvtaq_u16_f16: case NEON::BI__builtin_neon_vcvtaq_u32_v: case NEON::BI__builtin_neon_vcvtaq_u64_v: - case NEON::BI__builtin_neon_vcvtn_s16_v: + case NEON::BI__builtin_neon_vcvtn_s16_f16: case NEON::BI__builtin_neon_vcvtn_s32_v: case NEON::BI__builtin_neon_vcvtn_s64_v: - case NEON::BI__builtin_neon_vcvtn_u16_v: + case NEON::BI__builtin_neon_vcvtn_u16_f16: case NEON::BI__builtin_neon_vcvtn_u32_v: case NEON::BI__builtin_neon_vcvtn_u64_v: - case NEON::BI__builtin_neon_vcvtnq_s16_v: + case NEON::BI__builtin_neon_vcvtnq_s16_f16: case NEON::BI__builtin_neon_vcvtnq_s32_v: case NEON::BI__builtin_neon_vcvtnq_s64_v: - case NEON::BI__builtin_neon_vcvtnq_u16_v: + case NEON::BI__builtin_neon_vcvtnq_u16_f16: case NEON::BI__builtin_neon_vcvtnq_u32_v: case NEON::BI__builtin_neon_vcvtnq_u64_v: - case NEON::BI__builtin_neon_vcvtp_s16_v: + case NEON::BI__builtin_neon_vcvtp_s16_f16: case NEON::BI__builtin_neon_vcvtp_s32_v: case NEON::BI__builtin_neon_vcvtp_s64_v: - case NEON::BI__builtin_neon_vcvtp_u16_v: + case NEON::BI__builtin_neon_vcvtp_u16_f16: case NEON::BI__builtin_neon_vcvtp_u32_v: case NEON::BI__builtin_neon_vcvtp_u64_v: - case NEON::BI__builtin_neon_vcvtpq_s16_v: + case NEON::BI__builtin_neon_vcvtpq_s16_f16: case NEON::BI__builtin_neon_vcvtpq_s32_v: case NEON::BI__builtin_neon_vcvtpq_s64_v: - case NEON::BI__builtin_neon_vcvtpq_u16_v: + case NEON::BI__builtin_neon_vcvtpq_u16_f16: case NEON::BI__builtin_neon_vcvtpq_u32_v: case NEON::BI__builtin_neon_vcvtpq_u64_v: - case NEON::BI__builtin_neon_vcvtm_s16_v: + case NEON::BI__builtin_neon_vcvtm_s16_f16: case NEON::BI__builtin_neon_vcvtm_s32_v: case NEON::BI__builtin_neon_vcvtm_s64_v: - case NEON::BI__builtin_neon_vcvtm_u16_v: + case NEON::BI__builtin_neon_vcvtm_u16_f16: case NEON::BI__builtin_neon_vcvtm_u32_v: case NEON::BI__builtin_neon_vcvtm_u64_v: - case NEON::BI__builtin_neon_vcvtmq_s16_v: + case NEON::BI__builtin_neon_vcvtmq_s16_f16: case NEON::BI__builtin_neon_vcvtmq_s32_v: case NEON::BI__builtin_neon_vcvtmq_s64_v: - case NEON::BI__builtin_neon_vcvtmq_u16_v: + case NEON::BI__builtin_neon_vcvtmq_u16_f16: case NEON::BI__builtin_neon_vcvtmq_u32_v: case NEON::BI__builtin_neon_vcvtmq_u64_v: { llvm::Type *Tys[2] = { Ty, GetFloatNeonType(this, Type) }; @@ -6991,10 +7181,10 @@ case NEON::BI__builtin_neon_vrshrq_n_v: return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrshr_n", 1, true); - case NEON::BI__builtin_neon_vsha512hq_v: - case NEON::BI__builtin_neon_vsha512h2q_v: - case NEON::BI__builtin_neon_vsha512su0q_v: - case NEON::BI__builtin_neon_vsha512su1q_v: 
{ + case NEON::BI__builtin_neon_vsha512hq_u64: + case NEON::BI__builtin_neon_vsha512h2q_u64: + case NEON::BI__builtin_neon_vsha512su0q_u64: + case NEON::BI__builtin_neon_vsha512su1q_u64: { Function *F = CGM.getIntrinsic(Int); return EmitNeonCall(F, Ops, ""); } @@ -7046,18 +7236,18 @@ Ops.push_back(getAlignmentValue32(PtrOp0)); return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, ""); } - case NEON::BI__builtin_neon_vsm3partw1q_v: - case NEON::BI__builtin_neon_vsm3partw2q_v: - case NEON::BI__builtin_neon_vsm3ss1q_v: - case NEON::BI__builtin_neon_vsm4ekeyq_v: - case NEON::BI__builtin_neon_vsm4eq_v: { + case NEON::BI__builtin_neon_vsm3partw1q_u32: + case NEON::BI__builtin_neon_vsm3partw2q_u32: + case NEON::BI__builtin_neon_vsm3ss1q_u32: + case NEON::BI__builtin_neon_vsm4ekeyq_u32: + case NEON::BI__builtin_neon_vsm4eq_u32: { Function *F = CGM.getIntrinsic(Int); return EmitNeonCall(F, Ops, ""); } - case NEON::BI__builtin_neon_vsm3tt1aq_v: - case NEON::BI__builtin_neon_vsm3tt1bq_v: - case NEON::BI__builtin_neon_vsm3tt2aq_v: - case NEON::BI__builtin_neon_vsm3tt2bq_v: { + case NEON::BI__builtin_neon_vsm3tt1aq_u32: + case NEON::BI__builtin_neon_vsm3tt1bq_u32: + case NEON::BI__builtin_neon_vsm3tt2aq_u32: + case NEON::BI__builtin_neon_vsm3tt2bq_u32: { Function *F = CGM.getIntrinsic(Int); Ops[3] = Builder.CreateZExt(Ops[3], Int64Ty); return EmitNeonCall(F, Ops, ""); @@ -7143,7 +7333,7 @@ } return SV; } - case NEON::BI__builtin_neon_vxarq_v: { + case NEON::BI__builtin_neon_vxarq_u64: { Function *F = CGM.getIntrinsic(Int); Ops[2] = Builder.CreateZExt(Ops[2], Int64Ty); return EmitNeonCall(F, Ops, ""); @@ -7167,70 +7357,71 @@ } return SV; } - case NEON::BI__builtin_neon_vdot_v: - case NEON::BI__builtin_neon_vdotq_v: { + case NEON::BI__builtin_neon_vdot_s32: + case NEON::BI__builtin_neon_vdot_u32: + case NEON::BI__builtin_neon_vdotq_s32: + case NEON::BI__builtin_neon_vdotq_u32: { auto *InputTy = llvm::FixedVectorType::get(Int8Ty, Ty->getPrimitiveSizeInBits() / 8); llvm::Type *Tys[2] = { Ty, InputTy }; - Int = Usgn ? 
LLVMIntrinsic : AltLLVMIntrinsic; return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vdot"); } - case NEON::BI__builtin_neon_vfmlal_low_v: - case NEON::BI__builtin_neon_vfmlalq_low_v: { + case NEON::BI__builtin_neon_vfmlal_low_f16: + case NEON::BI__builtin_neon_vfmlalq_low_f16: { auto *InputTy = llvm::FixedVectorType::get(HalfTy, Ty->getPrimitiveSizeInBits() / 16); llvm::Type *Tys[2] = { Ty, InputTy }; return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vfmlal_low"); } - case NEON::BI__builtin_neon_vfmlsl_low_v: - case NEON::BI__builtin_neon_vfmlslq_low_v: { + case NEON::BI__builtin_neon_vfmlsl_low_f16: + case NEON::BI__builtin_neon_vfmlslq_low_f16: { auto *InputTy = llvm::FixedVectorType::get(HalfTy, Ty->getPrimitiveSizeInBits() / 16); llvm::Type *Tys[2] = { Ty, InputTy }; return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vfmlsl_low"); } - case NEON::BI__builtin_neon_vfmlal_high_v: - case NEON::BI__builtin_neon_vfmlalq_high_v: { + case NEON::BI__builtin_neon_vfmlal_high_f16: + case NEON::BI__builtin_neon_vfmlalq_high_f16: { auto *InputTy = llvm::FixedVectorType::get(HalfTy, Ty->getPrimitiveSizeInBits() / 16); llvm::Type *Tys[2] = { Ty, InputTy }; return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vfmlal_high"); } - case NEON::BI__builtin_neon_vfmlsl_high_v: - case NEON::BI__builtin_neon_vfmlslq_high_v: { + case NEON::BI__builtin_neon_vfmlsl_high_f16: + case NEON::BI__builtin_neon_vfmlslq_high_f16: { auto *InputTy = llvm::FixedVectorType::get(HalfTy, Ty->getPrimitiveSizeInBits() / 16); llvm::Type *Tys[2] = { Ty, InputTy }; return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vfmlsl_high"); } - case NEON::BI__builtin_neon_vmmlaq_v: { + case NEON::BI__builtin_neon_vmmlaq_s32: + case NEON::BI__builtin_neon_vmmlaq_u32: { auto *InputTy = llvm::FixedVectorType::get(Int8Ty, Ty->getPrimitiveSizeInBits() / 8); llvm::Type *Tys[2] = { Ty, InputTy }; - Int = Usgn ? LLVMIntrinsic : AltLLVMIntrinsic; - return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmmla"); + return EmitNeonCall(CGM.getIntrinsic(LLVMIntrinsic, Tys), Ops, "vmmla"); } - case NEON::BI__builtin_neon_vusmmlaq_v: { + case NEON::BI__builtin_neon_vusmmlaq_s32: { auto *InputTy = llvm::FixedVectorType::get(Int8Ty, Ty->getPrimitiveSizeInBits() / 8); llvm::Type *Tys[2] = { Ty, InputTy }; return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vusmmla"); } - case NEON::BI__builtin_neon_vusdot_v: - case NEON::BI__builtin_neon_vusdotq_v: { + case NEON::BI__builtin_neon_vusdot_s32: + case NEON::BI__builtin_neon_vusdotq_s32: { auto *InputTy = llvm::FixedVectorType::get(Int8Ty, Ty->getPrimitiveSizeInBits() / 8); llvm::Type *Tys[2] = { Ty, InputTy }; return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vusdot"); } - case NEON::BI__builtin_neon_vbfdot_v: - case NEON::BI__builtin_neon_vbfdotq_v: { + case NEON::BI__builtin_neon_vbfdot_f32: + case NEON::BI__builtin_neon_vbfdotq_f32: { llvm::Type *InputTy = llvm::FixedVectorType::get(BFloatTy, Ty->getPrimitiveSizeInBits() / 16); llvm::Type *Tys[2] = { Ty, InputTy }; return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vbfdot"); } - case NEON::BI__builtin_neon___a32_vcvt_bf16_v: { + case NEON::BI__builtin_neon___a32_vcvt_bf16_f32: { llvm::Type *Tys[1] = { Ty }; Function *F = CGM.getIntrinsic(Int, Tys); return EmitNeonCall(F, Ops, "vcvtfp2bf"); @@ -7820,6 +8011,13 @@ if (Value *Result = EmitARMCDEBuiltinExpr(BuiltinID, E, ReturnValue, Arch)) return Result; + // Some intrinsics are equivalent - if they are use the base intrinsic ID. 
+ auto It = llvm::find_if(NEONEquivalentIntrinsicMap, [BuiltinID](auto &P) { + return P.first == BuiltinID; + }); + if (It != end(NEONEquivalentIntrinsicMap)) + BuiltinID = It->second; + // Find out if any arguments are required to be integer constant // expressions. unsigned ICEArguments = 0; @@ -10037,6 +10235,13 @@ if (Optional<MSVCIntrin> MsvcIntId = translateAarch64ToMsvcIntrin(BuiltinID)) return EmitMSVCBuiltinExpr(*MsvcIntId, E); + // Some intrinsics are equivalent - if they are use the base intrinsic ID. + auto It = llvm::find_if(NEONEquivalentIntrinsicMap, [BuiltinID](auto &P) { + return P.first == BuiltinID; + }); + if (It != end(NEONEquivalentIntrinsicMap)) + BuiltinID = It->second; + // Find out if any arguments are required to be integer constant // expressions. unsigned ICEArguments = 0; @@ -11133,26 +11338,26 @@ case NEON::BI__builtin_neon_vcvt_u32_v: case NEON::BI__builtin_neon_vcvt_s64_v: case NEON::BI__builtin_neon_vcvt_u64_v: - case NEON::BI__builtin_neon_vcvt_s16_v: - case NEON::BI__builtin_neon_vcvt_u16_v: + case NEON::BI__builtin_neon_vcvt_s16_f16: + case NEON::BI__builtin_neon_vcvt_u16_f16: case NEON::BI__builtin_neon_vcvtq_s32_v: case NEON::BI__builtin_neon_vcvtq_u32_v: case NEON::BI__builtin_neon_vcvtq_s64_v: case NEON::BI__builtin_neon_vcvtq_u64_v: - case NEON::BI__builtin_neon_vcvtq_s16_v: - case NEON::BI__builtin_neon_vcvtq_u16_v: { + case NEON::BI__builtin_neon_vcvtq_s16_f16: + case NEON::BI__builtin_neon_vcvtq_u16_f16: { Int = usgn ? Intrinsic::aarch64_neon_fcvtzu : Intrinsic::aarch64_neon_fcvtzs; llvm::Type *Tys[2] = {Ty, GetFloatNeonType(this, Type)}; return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvtz"); } - case NEON::BI__builtin_neon_vcvta_s16_v: - case NEON::BI__builtin_neon_vcvta_u16_v: + case NEON::BI__builtin_neon_vcvta_s16_f16: + case NEON::BI__builtin_neon_vcvta_u16_f16: case NEON::BI__builtin_neon_vcvta_s32_v: - case NEON::BI__builtin_neon_vcvtaq_s16_v: + case NEON::BI__builtin_neon_vcvtaq_s16_f16: case NEON::BI__builtin_neon_vcvtaq_s32_v: case NEON::BI__builtin_neon_vcvta_u32_v: - case NEON::BI__builtin_neon_vcvtaq_u16_v: + case NEON::BI__builtin_neon_vcvtaq_u16_f16: case NEON::BI__builtin_neon_vcvtaq_u32_v: case NEON::BI__builtin_neon_vcvta_s64_v: case NEON::BI__builtin_neon_vcvtaq_s64_v: @@ -11162,13 +11367,13 @@ llvm::Type *Tys[2] = { Ty, GetFloatNeonType(this, Type) }; return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvta"); } - case NEON::BI__builtin_neon_vcvtm_s16_v: + case NEON::BI__builtin_neon_vcvtm_s16_f16: case NEON::BI__builtin_neon_vcvtm_s32_v: - case NEON::BI__builtin_neon_vcvtmq_s16_v: + case NEON::BI__builtin_neon_vcvtmq_s16_f16: case NEON::BI__builtin_neon_vcvtmq_s32_v: - case NEON::BI__builtin_neon_vcvtm_u16_v: + case NEON::BI__builtin_neon_vcvtm_u16_f16: case NEON::BI__builtin_neon_vcvtm_u32_v: - case NEON::BI__builtin_neon_vcvtmq_u16_v: + case NEON::BI__builtin_neon_vcvtmq_u16_f16: case NEON::BI__builtin_neon_vcvtmq_u32_v: case NEON::BI__builtin_neon_vcvtm_s64_v: case NEON::BI__builtin_neon_vcvtmq_s64_v: @@ -11178,13 +11383,13 @@ llvm::Type *Tys[2] = { Ty, GetFloatNeonType(this, Type) }; return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvtm"); } - case NEON::BI__builtin_neon_vcvtn_s16_v: + case NEON::BI__builtin_neon_vcvtn_s16_f16: case NEON::BI__builtin_neon_vcvtn_s32_v: - case NEON::BI__builtin_neon_vcvtnq_s16_v: + case NEON::BI__builtin_neon_vcvtnq_s16_f16: case NEON::BI__builtin_neon_vcvtnq_s32_v: - case NEON::BI__builtin_neon_vcvtn_u16_v: + case NEON::BI__builtin_neon_vcvtn_u16_f16: case
Index: clang/test/CodeGen/aarch64-neon-sm4-sm3.c
===================================================================
--- clang/test/CodeGen/aarch64-neon-sm4-sm3.c
+++ clang/test/CodeGen/aarch64-neon-sm4-sm3.c
@@ -2,7 +2,7 @@
 // RUN: -target-feature +sm4 -S -emit-llvm -fallow-half-arguments-and-returns -o - %s \
 // RUN: | FileCheck %s
 
-// RUN: not %clang_cc1 -Wno-error=implicit-function-declaration -triple aarch64-linux-gnu -target-feature +neon \
+// RUN: not %clang_cc1 -triple aarch64-linux-gnu -target-feature +neon \
 // RUN: -S -emit-llvm -fallow-half-arguments-and-returns -o - %s 2>&1 | FileCheck --check-prefix=CHECK-NO-CRYPTO %s
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
@@ -11,7 +11,7 @@
 
 void test_vsm3partw1(uint32x4_t a, uint32x4_t b, uint32x4_t c) {
   // CHECK-LABEL: @test_vsm3partw1(
-  // CHECK-NO-CRYPTO: warning: call to undeclared function 'vsm3partw1q_u32'
+  // CHECK-NO-CRYPTO: error: always_inline function 'vsm3partw1q_u32' requires target feature 'sm4'
   // CHECK: call <4 x i32> @llvm.aarch64.crypto.sm3partw1
   uint32x4_t result = vsm3partw1q_u32(a, b, c);
 }
Index: clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics.c
===================================================================
--- clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics.c
+++ clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics.c
@@ -1469,14 +1469,14 @@
 // CHECK-LABEL: define {{[^@]+}}@test_vfma_n_f16
 // CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], half noundef [[C:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x half> undef, half [[C]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x half> [[VECINIT_I]], half [[C]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x half> [[VECINIT1_I]], half [[C]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x half> [[VECINIT2_I]], half [[C]], i32 3
+// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> undef, half [[C]], i32 0
+// CHECK-NEXT: [[VECINIT1:%.*]] = insertelement <4 x half> [[VECINIT]], half [[C]], i32 1
+// CHECK-NEXT: [[VECINIT2:%.*]] = insertelement <4 x half> [[VECINIT1]], half [[C]], i32 2
+// CHECK-NEXT: [[VECINIT3:%.*]] = insertelement <4 x half> [[VECINIT2]], half [[C]], i32 3
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[VECINIT3_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[B]], <4 x half> [[VECINIT3_I]], <4 x half> [[A]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[VECINIT3]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[B]], <4 x half> [[VECINIT3]], <4 x half> [[A]])
 // CHECK-NEXT: ret <4 x half> [[TMP3]]
 //
 float16x4_t test_vfma_n_f16(float16x4_t a, float16x4_t b, float16_t c) {
@@ -1486,18 +1486,18 @@
 // CHECK-LABEL: define {{[^@]+}}@test_vfmaq_n_f16
 // CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], half noundef [[C:%.*]]) #[[ATTR1]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x half> undef, half [[C]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x half> [[VECINIT_I]], half [[C]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x half> [[VECINIT1_I]], half [[C]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x half> [[VECINIT2_I]], half [[C]], i32 3
-// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x half> [[VECINIT3_I]], half [[C]], i32 4
-// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x half> [[VECINIT4_I]], half [[C]], i32 5
-// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x half> [[VECINIT5_I]], half [[C]], i32 6
-// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x half> [[VECINIT6_I]], half [[C]], i32 7
+// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> undef, half [[C]], i32 0
+// CHECK-NEXT: [[VECINIT1:%.*]] = insertelement <8 x half> [[VECINIT]], half [[C]], i32 1
+// CHECK-NEXT: [[VECINIT2:%.*]] = insertelement <8 x half> [[VECINIT1]], half [[C]], i32 2
+// CHECK-NEXT: [[VECINIT3:%.*]] = insertelement <8 x half> [[VECINIT2]], half [[C]], i32 3
+// CHECK-NEXT: [[VECINIT4:%.*]] = insertelement <8 x half> [[VECINIT3]], half [[C]], i32 4
+// CHECK-NEXT: [[VECINIT5:%.*]] = insertelement <8 x half> [[VECINIT4]], half [[C]], i32 5
+// CHECK-NEXT: [[VECINIT6:%.*]] = insertelement <8 x half> [[VECINIT5]], half [[C]], i32 6
+// CHECK-NEXT: [[VECINIT7:%.*]] = insertelement <8 x half> [[VECINIT6]], half [[C]], i32 7
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[VECINIT7_I]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[B]], <8 x half> [[VECINIT7_I]], <8 x half> [[A]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[VECINIT7]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[B]], <8 x half> [[VECINIT7]], <8 x half> [[A]])
 // CHECK-NEXT: ret <8 x half> [[TMP3]]
 //
 float16x8_t test_vfmaq_n_f16(float16x8_t a, float16x8_t b, float16_t c) {
@@ -1601,15 +1601,15 @@
 // CHECK-LABEL: define {{[^@]+}}@test_vfms_n_f16
 // CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], half noundef [[C:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[FNEG_I:%.*]] = fneg <4 x half> [[B]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x half> undef, half [[C]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x half> [[VECINIT_I]], half [[C]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x half> [[VECINIT1_I]], half [[C]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x half> [[VECINIT2_I]], half [[C]], i32 3
+// CHECK-NEXT: [[FNEG:%.*]] = fneg <4 x half> [[B]]
+// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> undef, half [[C]], i32 0
+// CHECK-NEXT: [[VECINIT1:%.*]] = insertelement <4 x half> [[VECINIT]], half [[C]], i32 1
+// CHECK-NEXT: [[VECINIT2:%.*]] = insertelement <4 x half> [[VECINIT1]], half [[C]], i32 2
+// CHECK-NEXT: [[VECINIT3:%.*]] = insertelement <4 x half> [[VECINIT2]], half [[C]], i32 3
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[FNEG_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[VECINIT3_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[FNEG_I]], <4 x half> [[VECINIT3_I]], <4 x half> [[A]])
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[FNEG]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[VECINIT3]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[FNEG]], <4 x half> [[VECINIT3]], <4 x half> [[A]])
 // CHECK-NEXT: ret <4 x half> [[TMP3]]
 //
 float16x4_t test_vfms_n_f16(float16x4_t a, float16x4_t b, float16_t c) {
@@ -1619,19 +1619,19 @@
 // CHECK-LABEL: define {{[^@]+}}@test_vfmsq_n_f16
 // CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], half noundef [[C:%.*]]) #[[ATTR1]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[FNEG_I:%.*]] = fneg <8 x half> [[B]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x half> undef, half [[C]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x half> [[VECINIT_I]], half [[C]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x half> [[VECINIT1_I]], half [[C]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x half> [[VECINIT2_I]], half [[C]], i32 3
-// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x half> [[VECINIT3_I]], half [[C]], i32 4
-// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x half> [[VECINIT4_I]], half [[C]], i32 5
-// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x half> [[VECINIT5_I]], half [[C]], i32 6
-// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x half> [[VECINIT6_I]], half [[C]], i32 7
+// CHECK-NEXT: [[FNEG:%.*]] = fneg <8 x half> [[B]]
+// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> undef, half [[C]], i32 0
+// CHECK-NEXT: [[VECINIT1:%.*]] = insertelement <8 x half> [[VECINIT]], half [[C]], i32 1
+// CHECK-NEXT: [[VECINIT2:%.*]] = insertelement <8 x half> [[VECINIT1]], half [[C]], i32 2
+// CHECK-NEXT: [[VECINIT3:%.*]] = insertelement <8 x half> [[VECINIT2]], half [[C]], i32 3
+// CHECK-NEXT: [[VECINIT4:%.*]] = insertelement <8 x half> [[VECINIT3]], half [[C]], i32 4
+// CHECK-NEXT: [[VECINIT5:%.*]] = insertelement <8 x half> [[VECINIT4]], half [[C]], i32 5
+// CHECK-NEXT: [[VECINIT6:%.*]] = insertelement <8 x half> [[VECINIT5]], half [[C]], i32 6
+// CHECK-NEXT: [[VECINIT7:%.*]] = insertelement <8 x half> [[VECINIT6]], half [[C]], i32 7
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[FNEG_I]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[VECINIT7_I]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[FNEG_I]], <8 x half> [[VECINIT7_I]], <8 x half> [[A]])
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[FNEG]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[VECINIT7]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[FNEG]], <8 x half> [[VECINIT7]], <8 x half> [[A]])
 // CHECK-NEXT: ret <8 x half> [[TMP3]]
 //
 float16x8_t test_vfmsq_n_f16(float16x8_t a, float16x8_t b, float16_t c) {
@@ -1721,12 +1721,12 @@
 // CHECK-LABEL: define {{[^@]+}}@test_vmul_n_f16
 // CHECK-SAME: (<4 x half> noundef [[A:%.*]], half noundef [[B:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x half> undef, half [[B]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x half> [[VECINIT_I]], half [[B]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x half> [[VECINIT1_I]], half [[B]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x half> [[VECINIT2_I]], half [[B]], i32 3
-// CHECK-NEXT: [[MUL_I:%.*]] = fmul <4 x half> [[A]], [[VECINIT3_I]]
-// CHECK-NEXT: ret <4 x half> [[MUL_I]]
+// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> undef, half [[B]], i32 0
+// CHECK-NEXT: [[VECINIT1:%.*]] = insertelement <4 x half> [[VECINIT]], half [[B]], i32 1
+// CHECK-NEXT: [[VECINIT2:%.*]] = insertelement <4 x half> [[VECINIT1]], half [[B]], i32 2
+// CHECK-NEXT: [[VECINIT3:%.*]] = insertelement <4 x half> [[VECINIT2]], half [[B]], i32 3
+// CHECK-NEXT: [[MUL:%.*]] = fmul <4 x half> [[A]], [[VECINIT3]]
+// CHECK-NEXT: ret <4 x half> [[MUL]]
 //
 float16x4_t test_vmul_n_f16(float16x4_t a, float16_t b) {
   return vmul_n_f16(a, b);
@@ -1735,16 +1735,16 @@
 // CHECK-LABEL: define {{[^@]+}}@test_vmulq_n_f16
 // CHECK-SAME: (<8 x half> noundef [[A:%.*]], half noundef [[B:%.*]]) #[[ATTR1]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x half> undef, half [[B]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x half> [[VECINIT_I]], half [[B]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x half> [[VECINIT1_I]], half [[B]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x half> [[VECINIT2_I]], half [[B]], i32 3
-// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x half> [[VECINIT3_I]], half [[B]], i32 4
-// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x half> [[VECINIT4_I]], half [[B]], i32 5
-// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x half> [[VECINIT5_I]], half [[B]], i32 6
-// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x half> [[VECINIT6_I]], half [[B]], i32 7
-// CHECK-NEXT: [[MUL_I:%.*]] = fmul <8 x half> [[A]], [[VECINIT7_I]]
-// CHECK-NEXT: ret <8 x half> [[MUL_I]]
+// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> undef, half [[B]], i32 0
+// CHECK-NEXT: [[VECINIT1:%.*]] = insertelement <8 x half> [[VECINIT]], half [[B]], i32 1
+// CHECK-NEXT: [[VECINIT2:%.*]] = insertelement <8 x half> [[VECINIT1]], half [[B]], i32 2
+// CHECK-NEXT: [[VECINIT3:%.*]] = insertelement <8 x half> [[VECINIT2]], half [[B]], i32 3
+// CHECK-NEXT: [[VECINIT4:%.*]] = insertelement <8 x half> [[VECINIT3]], half [[B]], i32 4
+// CHECK-NEXT: [[VECINIT5:%.*]] = insertelement <8 x half> [[VECINIT4]], half [[B]], i32 5
+// CHECK-NEXT: [[VECINIT6:%.*]] = insertelement <8 x half> [[VECINIT5]], half [[B]], i32 6
+// CHECK-NEXT: [[VECINIT7:%.*]] = insertelement <8 x half> [[VECINIT6]], half [[B]], i32 7
+// CHECK-NEXT: [[MUL:%.*]] = fmul <8 x half> [[A]], [[VECINIT7]]
+// CHECK-NEXT: ret <8 x half> [[MUL]]
 //
 float16x8_t test_vmulq_n_f16(float16x8_t a, float16_t b) {
   return vmulq_n_f16(a, b);
@@ -1754,15 +1754,15 @@
 // CHECK-LABEL: define {{[^@]+}}@test_vmulh_lane_f16
 // CHECK-SAME: (half noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[__REINT_847:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT: [[__REINT1_847:%.*]] = alloca i16, align 2
+// CHECK-NEXT: [[__REINT_831:%.*]] = alloca <4 x half>, align 8
+// CHECK-NEXT: [[__REINT1_831:%.*]] = alloca i16, align 2
 // CHECK-NEXT: [[CONV:%.*]] = fpext half [[A]] to float
-// CHECK-NEXT: store <4 x half> [[B]], <4 x half>* [[__REINT_847]], align 8
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half>* [[__REINT_847]] to <4 x i16>*
+// CHECK-NEXT: store <4 x half> [[B]], <4 x half>* [[__REINT_831]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half>* [[__REINT_831]] to <4 x i16>*
 // CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* [[TMP0]], align 8
 // CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
-// CHECK-NEXT: store i16 [[VGET_LANE]], i16* [[__REINT1_847]], align 2
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast i16* [[__REINT1_847]] to half*
+// CHECK-NEXT: store i16 [[VGET_LANE]], i16* [[__REINT1_831]], align 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i16* [[__REINT1_831]] to half*
 // CHECK-NEXT: [[TMP3:%.*]] = load half, half* [[TMP2]], align 2
 // CHECK-NEXT: [[CONV2:%.*]] = fpext half [[TMP3]] to float
 // CHECK-NEXT: [[MUL:%.*]] = fmul float [[CONV]], [[CONV2]]
@@ -1776,15 +1776,15 @@
 // CHECK-LABEL: define {{[^@]+}}@test_vmulh_laneq_f16
 // CHECK-SAME: (half noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR1]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[__REINT_850:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT: [[__REINT1_850:%.*]] = alloca i16, align 2
+// CHECK-NEXT: [[__REINT_834:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT: [[__REINT1_834:%.*]] = alloca i16, align 2
 // CHECK-NEXT: [[CONV:%.*]] = fpext half [[A]] to float
-// CHECK-NEXT: store <8 x half> [[B]], <8 x half>* [[__REINT_850]], align 16
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half>* [[__REINT_850]] to <8 x i16>*
+// CHECK-NEXT: store <8 x half> [[B]], <8 x half>* [[__REINT_834]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half>* [[__REINT_834]] to <8 x i16>*
 // CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 16
 // CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
-// CHECK-NEXT: store i16 [[VGETQ_LANE]], i16* [[__REINT1_850]], align 2
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast i16* [[__REINT1_850]] to half*
+// CHECK-NEXT: store i16 [[VGETQ_LANE]], i16* [[__REINT1_834]], align 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i16* [[__REINT1_834]] to half*
 // CHECK-NEXT: [[TMP3:%.*]] = load half, half* [[TMP2]], align 2
 // CHECK-NEXT: [[CONV2:%.*]] = fpext half [[TMP3]] to float
 // CHECK-NEXT: [[MUL:%.*]] = fmul float [[CONV]], [[CONV2]]
@@ -1858,14 +1858,14 @@
 // CHECK-LABEL: define {{[^@]+}}@test_vmulx_n_f16
 // CHECK-SAME: (<4 x half> noundef [[A:%.*]], half noundef [[B:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x half> undef, half [[B]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x half> [[VECINIT_I]], half [[B]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x half> [[VECINIT1_I]], half [[B]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x half> [[VECINIT2_I]], half [[B]], i32 3
+// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> undef, half [[B]], i32 0
+// CHECK-NEXT: [[VECINIT1:%.*]] = insertelement <4 x half> [[VECINIT]], half [[B]], i32 1
+// CHECK-NEXT: [[VECINIT2:%.*]] = insertelement <4 x half> [[VECINIT1]], half [[B]], i32 2
+// CHECK-NEXT: [[VECINIT3:%.*]] = insertelement <4 x half> [[VECINIT2]], half [[B]], i32 3
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[VECINIT3_I]] to <8 x i8>
-// CHECK-NEXT: [[VMULX2_I_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fmulx.v4f16(<4 x half> [[A]], <4 x half> [[VECINIT3_I]])
-// CHECK-NEXT: ret <4 x half> [[VMULX2_I_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[VECINIT3]] to <8 x i8>
+// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fmulx.v4f16(<4 x half> [[A]], <4 x half> [[VECINIT3]])
+// CHECK-NEXT: ret <4 x half> [[VMULX2_I]]
 //
 float16x4_t test_vmulx_n_f16(float16x4_t a, float16_t b) {
   return vmulx_n_f16(a, b);
@@ -1874,18 +1874,18 @@
 // CHECK-LABEL: define {{[^@]+}}@test_vmulxq_n_f16
 // CHECK-SAME: (<8 x half> noundef [[A:%.*]], half noundef [[B:%.*]]) #[[ATTR1]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x half> undef, half [[B]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x half> [[VECINIT_I]], half [[B]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x half> [[VECINIT1_I]], half [[B]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x half> [[VECINIT2_I]], half [[B]], i32 3
-// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x half> [[VECINIT3_I]], half [[B]], i32 4
-// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x half> [[VECINIT4_I]], half [[B]], i32 5
-// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x half> [[VECINIT5_I]], half [[B]], i32 6
-// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x half> [[VECINIT6_I]], half [[B]], i32 7
+// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> undef, half [[B]], i32 0
+// CHECK-NEXT: [[VECINIT1:%.*]] = insertelement <8 x half> [[VECINIT]], half [[B]], i32 1
+// CHECK-NEXT: [[VECINIT2:%.*]] = insertelement <8 x half> [[VECINIT1]], half [[B]], i32 2
+// CHECK-NEXT: [[VECINIT3:%.*]] = insertelement <8 x half> [[VECINIT2]], half [[B]], i32 3
+// CHECK-NEXT: [[VECINIT4:%.*]] = insertelement <8 x half> [[VECINIT3]], half [[B]], i32 4
+// CHECK-NEXT: [[VECINIT5:%.*]] = insertelement <8 x half> [[VECINIT4]], half [[B]], i32 5
+// CHECK-NEXT: [[VECINIT6:%.*]] = insertelement <8 x half> [[VECINIT5]], half [[B]], i32 6
+// CHECK-NEXT: [[VECINIT7:%.*]] = insertelement <8 x half> [[VECINIT6]], half [[B]], i32 7
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[VECINIT7_I]] to <16 x i8>
-// CHECK-NEXT: [[VMULX2_I_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmulx.v8f16(<8 x half> [[A]], <8 x half> [[VECINIT7_I]])
-// CHECK-NEXT: ret <8 x half> [[VMULX2_I_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[VECINIT7]] to <16 x i8>
+// CHECK-NEXT: [[VMULX2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmulx.v8f16(<8 x half> [[A]], <8 x half> [[VECINIT7]])
+// CHECK-NEXT: ret <8 x half> [[VMULX2_I]]
 //
 float16x8_t test_vmulxq_n_f16(float16x8_t a, float16_t b) {
   return vmulxq_n_f16(a, b);
@@ -1917,8 +1917,9 @@
 // CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[VMAXV1_I:%.*]] = call half @llvm.aarch64.neon.fmaxv.f16.v4f16(<4 x half> [[A]])
-// CHECK-NEXT: ret half [[VMAXV1_I]]
+// CHECK-NEXT: [[VMAXV:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
+// CHECK-NEXT: [[VMAXV1:%.*]] = call half @llvm.aarch64.neon.fmaxv.f16.v4f16(<4 x half> [[VMAXV]])
+// CHECK-NEXT: ret half [[VMAXV1]]
 //
 float16_t test_vmaxv_f16(float16x4_t a) {
   return vmaxv_f16(a);
@@ -1928,8 +1929,9 @@
 // CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR1]] {
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VMAXV1_I:%.*]] = call half @llvm.aarch64.neon.fmaxv.f16.v8f16(<8 x half> [[A]])
-// CHECK-NEXT: ret half [[VMAXV1_I]]
+// CHECK-NEXT: [[VMAXV:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
+// CHECK-NEXT: [[VMAXV1:%.*]] = call half @llvm.aarch64.neon.fmaxv.f16.v8f16(<8 x half> [[VMAXV]])
+// CHECK-NEXT: ret half [[VMAXV1]]
 //
 float16_t test_vmaxvq_f16(float16x8_t a) {
   return vmaxvq_f16(a);
@@ -1939,8 +1941,9 @@
 // CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[VMINV1_I:%.*]] = call half @llvm.aarch64.neon.fminv.f16.v4f16(<4 x half> [[A]])
-// CHECK-NEXT: ret half [[VMINV1_I]]
+// CHECK-NEXT: [[VMINV:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
+// CHECK-NEXT: [[VMINV1:%.*]] = call half @llvm.aarch64.neon.fminv.f16.v4f16(<4 x half> [[VMINV]])
+// CHECK-NEXT: ret half [[VMINV1]]
 //
 float16_t test_vminv_f16(float16x4_t a) {
   return vminv_f16(a);
@@ -1950,8 +1953,9 @@
 // CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR1]] {
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VMINV1_I:%.*]] = call half @llvm.aarch64.neon.fminv.f16.v8f16(<8 x half> [[A]])
-// CHECK-NEXT: ret half [[VMINV1_I]]
+// CHECK-NEXT: [[VMINV:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
+// CHECK-NEXT: [[VMINV1:%.*]] = call half @llvm.aarch64.neon.fminv.f16.v8f16(<8 x half> [[VMINV]])
+// CHECK-NEXT: ret half [[VMINV1]]
 //
 float16_t test_vminvq_f16(float16x8_t a) {
   return vminvq_f16(a);
@@ -1961,8 +1965,9 @@
 // CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[VMAXNMV1_I:%.*]] = call half @llvm.aarch64.neon.fmaxnmv.f16.v4f16(<4 x half> [[A]])
-// CHECK-NEXT: ret half [[VMAXNMV1_I]]
+// CHECK-NEXT: [[VMAXNMV:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
+// CHECK-NEXT: [[VMAXNMV1:%.*]] = call half @llvm.aarch64.neon.fmaxnmv.f16.v4f16(<4 x half> [[VMAXNMV]])
+// CHECK-NEXT: ret half [[VMAXNMV1]]
 //
 float16_t test_vmaxnmv_f16(float16x4_t a) {
   return vmaxnmv_f16(a);
@@ -1972,8 +1977,9 @@
 // CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR1]] {
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VMAXNMV1_I:%.*]] = call half @llvm.aarch64.neon.fmaxnmv.f16.v8f16(<8 x half> [[A]])
-// CHECK-NEXT: ret half [[VMAXNMV1_I]]
+// CHECK-NEXT: [[VMAXNMV:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
+// CHECK-NEXT: [[VMAXNMV1:%.*]] = call half @llvm.aarch64.neon.fmaxnmv.f16.v8f16(<8 x half> [[VMAXNMV]])
+// CHECK-NEXT: ret half [[VMAXNMV1]]
 //
 float16_t test_vmaxnmvq_f16(float16x8_t a) {
   return vmaxnmvq_f16(a);
@@ -1983,8 +1989,9 @@
 // CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[VMINNMV1_I:%.*]] = call half @llvm.aarch64.neon.fminnmv.f16.v4f16(<4 x half> [[A]])
-// CHECK-NEXT: ret half [[VMINNMV1_I]]
+// CHECK-NEXT: [[VMINNMV:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
+// CHECK-NEXT: [[VMINNMV1:%.*]] = call half @llvm.aarch64.neon.fminnmv.f16.v4f16(<4 x half> [[VMINNMV]])
+// CHECK-NEXT: ret half [[VMINNMV1]]
 //
 float16_t test_vminnmv_f16(float16x4_t a) {
   return vminnmv_f16(a);
@@ -1994,8 +2001,9 @@
 // CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR1]] {
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VMINNMV1_I:%.*]] = call half @llvm.aarch64.neon.fminnmv.f16.v8f16(<8 x half> [[A]])
-// CHECK-NEXT: ret half [[VMINNMV1_I]]
+// CHECK-NEXT: [[VMINNMV:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
+// CHECK-NEXT: [[VMINNMV1:%.*]] = call half @llvm.aarch64.neon.fminnmv.f16.v8f16(<8 x half> [[VMINNMV]])
+// CHECK-NEXT: ret half [[VMINNMV1]]
 //
 float16_t test_vminnmvq_f16(float16x8_t a) {
   return vminnmvq_f16(a);
@@ -2192,11 +2200,11 @@
 // CHECK-LABEL: define {{[^@]+}}@test_vmov_n_f16
 // CHECK-SAME: (half noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x half> undef, half [[A]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x half> [[VECINIT_I]], half [[A]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x half> [[VECINIT1_I]], half [[A]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x half> [[VECINIT2_I]], half [[A]], i32 3
-// CHECK-NEXT: ret <4 x half> [[VECINIT3_I]]
+// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> undef, half [[A]], i32 0
+// CHECK-NEXT: [[VECINIT1:%.*]] = insertelement <4 x half> [[VECINIT]], half [[A]], i32 1
+// CHECK-NEXT: [[VECINIT2:%.*]] = insertelement <4 x half> [[VECINIT1]], half [[A]], i32 2
+// CHECK-NEXT: [[VECINIT3:%.*]] = insertelement <4 x half> [[VECINIT2]], half [[A]], i32 3
+// CHECK-NEXT: ret <4 x half> [[VECINIT3]]
 //
 float16x4_t test_vmov_n_f16(float16_t a) {
   return vmov_n_f16(a);
@@ -2205,15 +2213,15 @@
 // CHECK-LABEL: define {{[^@]+}}@test_vmovq_n_f16
 // CHECK-SAME: (half noundef [[A:%.*]]) #[[ATTR1]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x half> undef, half [[A]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x half> [[VECINIT_I]], half [[A]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x half> [[VECINIT1_I]], half [[A]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x half> [[VECINIT2_I]], half [[A]], i32 3
-// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x half> [[VECINIT3_I]], half [[A]], i32 4
-// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x half> [[VECINIT4_I]], half [[A]], i32 5
-// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x half> [[VECINIT5_I]], half [[A]], i32 6
-// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x half> [[VECINIT6_I]], half [[A]], i32 7
-// CHECK-NEXT: ret <8 x half> [[VECINIT7_I]]
+// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> undef, half [[A]], i32 0
+// CHECK-NEXT: [[VECINIT1:%.*]] = insertelement <8 x half> [[VECINIT]], half [[A]], i32 1
+// CHECK-NEXT: [[VECINIT2:%.*]] = insertelement <8 x half> [[VECINIT1]], half [[A]], i32 2
+// CHECK-NEXT: [[VECINIT3:%.*]] = insertelement <8 x half> [[VECINIT2]], half [[A]], i32 3
+// CHECK-NEXT: [[VECINIT4:%.*]] = insertelement <8 x half> [[VECINIT3]], half [[A]], i32 4
+// CHECK-NEXT: [[VECINIT5:%.*]] = insertelement <8 x half> [[VECINIT4]], half [[A]], i32 5
+// CHECK-NEXT: [[VECINIT6:%.*]] = insertelement <8 x half> [[VECINIT5]], half [[A]], i32 6
+// CHECK-NEXT: [[VECINIT7:%.*]] = insertelement <8 x half> [[VECINIT6]], half [[A]], i32 7
+// CHECK-NEXT: ret <8 x half> [[VECINIT7]]
 //
 float16x8_t test_vmovq_n_f16(float16_t a) {
   return vmovq_n_f16(a);
@@ -2222,11 +2230,11 @@
 // CHECK-LABEL: define {{[^@]+}}@test_vdup_n_f16
 // CHECK-SAME: (half noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x half> undef, half [[A]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x half> [[VECINIT_I]], half [[A]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x half> [[VECINIT1_I]], half [[A]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x half> [[VECINIT2_I]], half [[A]], i32 3
-// CHECK-NEXT: ret <4 x half> [[VECINIT3_I]]
+// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> undef, half [[A]], i32 0
+// CHECK-NEXT: [[VECINIT1:%.*]] = insertelement <4 x half> [[VECINIT]], half [[A]], i32 1
+// CHECK-NEXT: [[VECINIT2:%.*]] = insertelement <4 x half> [[VECINIT1]], half [[A]], i32 2
+// CHECK-NEXT: [[VECINIT3:%.*]] = insertelement <4 x half> [[VECINIT2]], half [[A]], i32 3
+// CHECK-NEXT: ret <4 x half> [[VECINIT3]]
 //
 float16x4_t test_vdup_n_f16(float16_t a) {
   return vdup_n_f16(a);
@@ -2235,15 +2243,15 @@
 // CHECK-LABEL: define {{[^@]+}}@test_vdupq_n_f16
 // CHECK-SAME: (half noundef [[A:%.*]]) #[[ATTR1]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x half> undef, half [[A]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x half> [[VECINIT_I]], half [[A]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x half> [[VECINIT1_I]], half [[A]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x half> [[VECINIT2_I]], half [[A]], i32 3
-// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x half> [[VECINIT3_I]], half [[A]], i32 4
-// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x half> [[VECINIT4_I]], half [[A]], i32 5
-// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x half> [[VECINIT5_I]], half [[A]], i32 6
-// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x half> [[VECINIT6_I]], half [[A]], i32 7
-// CHECK-NEXT: ret <8 x half> [[VECINIT7_I]]
+// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> undef, half [[A]], i32 0
+// CHECK-NEXT: [[VECINIT1:%.*]] = insertelement <8 x half> [[VECINIT]], half [[A]], i32 1
+// CHECK-NEXT: [[VECINIT2:%.*]] = insertelement <8 x half> [[VECINIT1]], half [[A]], i32 2
+// CHECK-NEXT: [[VECINIT3:%.*]] = insertelement <8 x half> [[VECINIT2]], half [[A]], i32 3
+// CHECK-NEXT: [[VECINIT4:%.*]] = insertelement <8 x half> [[VECINIT3]], half [[A]], i32 4
+// CHECK-NEXT: [[VECINIT5:%.*]] = insertelement <8 x half> [[VECINIT4]], half [[A]], i32 5
+// CHECK-NEXT: [[VECINIT6:%.*]] = insertelement <8 x half> [[VECINIT5]], half [[A]], i32 6
+// CHECK-NEXT: [[VECINIT7:%.*]] = insertelement <8 x half> [[VECINIT6]], half [[A]], i32 7
+// CHECK-NEXT: ret <8 x half> [[VECINIT7]]
 //
 float16x8_t test_vdupq_n_f16(float16_t a) {
   return vdupq_n_f16(a);
Index: clang/test/CodeGen/neon-crypto.c
===================================================================
--- clang/test/CodeGen/neon-crypto.c
+++ clang/test/CodeGen/neon-crypto.c
@@ -6,7 +6,7 @@
 // RUN: -target-feature +sha2 -target-feature +aes \
 // RUN: -emit-llvm -O1 -fallow-half-arguments-and-returns -o - %s | FileCheck %s
 // RUN: not %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
-// RUN: -S -O3 -o -fallow-half-arguments-and-returns - %s 2>&1 | FileCheck --check-prefix=CHECK-NO-CRYPTO %s
+// RUN: -S -O3 -fallow-half-arguments-and-returns -o - %s 2>&1 | FileCheck --check-prefix=CHECK-NO-CRYPTO %s
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
@@ -14,7 +14,7 @@
 
 uint8x16_t test_vaeseq_u8(uint8x16_t data, uint8x16_t key) {
   // CHECK-LABEL: @test_vaeseq_u8
-  // CHECK-NO-CRYPTO: error: call to undeclared function 'vaeseq_u8'
+  // CHECK-NO-CRYPTO: error: always_inline function 'vaeseq_u8' requires target feature 'aes'
   return vaeseq_u8(data, key);
   // CHECK: call <16 x i8> @llvm.{{arm.neon|aarch64.crypto}}.aese(<16 x i8> %data, <16 x i8> %key)
 }
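Note: the CHECK-NO-CRYPTO expectations above change because these intrinsics are now always declared in arm_neon.h and gated by a target attribute, so a call without the feature fails CodeGen's always_inline target-feature check instead of becoming an implicit declaration. A sketch of what user code needs under the new model (the attribute string and diagnostic text are taken from the tests above; the function name is illustrative):

#include <arm_neon.h>

// Without this attribute (and without -march/-target-feature enabling 'aes'),
// the call below is now rejected with:
//   error: always_inline function 'vaeseq_u8' requires target feature 'aes'
__attribute__((target("aes")))
uint8x16_t aes_round(uint8x16_t data, uint8x16_t key) {
  return vaeseq_u8(data, key); // OK: the caller guarantees the feature
}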
Index: clang/test/Sema/aarch64-neon-target.c
===================================================================
--- /dev/null
+++ clang/test/Sema/aarch64-neon-target.c
@@ -0,0 +1,66 @@
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon -fallow-half-arguments-and-returns -fsyntax-only -verify -emit-llvm -o - %s
+// REQUIRES: aarch64-registered-target
+
+// Test that functions with the correct target attributes can use the correct NEON intrinsics.
+
+#include <arm_neon.h>
+
+__attribute__((target("dotprod")))
+void dotprod(uint32x2_t v2i32, uint8x16_t v16i8, uint8x8_t v8i8) {
+  vdot_u32(v2i32, v8i8, v8i8);
+  vdot_laneq_u32(v2i32, v8i8, v16i8, 1);
+}
+
+__attribute__((target("fullfp16")))
+void fp16(uint32x2_t v2i32, uint32x4_t v4i32, uint16x8_t v8i16, uint8x16_t v16i8, uint8x8_t v8i8, float32x2_t v2f32, float32x4_t v4f32, float16x4_t v4f16, bfloat16x4_t v4bf16) {
+  vceqz_f16(v4f16);
+  vrnd_f16(v4f16);
+  vmaxnm_f16(v4f16, v4f16);
+  vrndi_f16(v4f16);
+}
+
+__attribute__((target("fp16fml")))
+void fp16fml(float32x2_t v2f32, float16x4_t v4f16) {
+  vfmlal_low_f16(v2f32, v4f16, v4f16);
+}
+
+__attribute__((target("i8mm")))
+void i8mm(uint32x2_t v2i32, uint32x4_t v4i32, uint16x8_t v8i16, uint8x16_t v16i8, uint8x8_t v8i8, float32x2_t v2f32, float32x4_t v4f32, float16x4_t v4f16, bfloat16x4_t v4bf16) {
+  vmmlaq_s32(v4i32, v8i16, v8i16);
+  vusdot_laneq_s32(v2i32, v8i8, v8i16, 0);
+}
+
+__attribute__((target("bf16")))
+void bf16(uint32x2_t v2i32, uint32x4_t v4i32, uint16x8_t v8i16, uint8x16_t v16i8, uint8x8_t v8i8, float32x2_t v2f32, float32x4_t v4f32, float16x4_t v4f16, bfloat16x4_t v4bf16, __bf16 bf16) {
+  vbfdot_f32(v2f32, v4bf16, v4bf16);
+  vcreate_bf16(10);
+  vdup_lane_bf16(v4bf16, 2);
+  vdup_n_bf16(bf16);
+  vld1_bf16(0);
+  vcvt_f32_bf16(v4bf16);
+  vcvt_bf16_f32(v4f32);
+}
+
+void undefined(uint32x2_t v2i32, uint32x4_t v4i32, uint16x8_t v8i16, uint8x16_t v16i8, uint8x8_t v8i8, float32x2_t v2f32, float32x4_t v4f32, float16x4_t v4f16, bfloat16x4_t v4bf16, __bf16 bf16) {
+  // dotprod
+  vdot_u32(v2i32, v8i8, v8i8); // expected-error {{always_inline function 'vdot_u32' requires target feature 'dotprod'}}
+  vdot_laneq_u32(v2i32, v8i8, v16i8, 1); // expected-error {{always_inline function 'vdot_u32' requires target feature 'dotprod'}}
+  // fp16
+  vceqz_f16(v4f16); // expected-error {{always_inline function 'vceqz_f16' requires target feature 'fullfp16'}}
+  vrnd_f16(v4f16); // expected-error {{always_inline function 'vrnd_f16' requires target feature 'fullfp16'}}
+  vmaxnm_f16(v4f16, v4f16); // expected-error {{always_inline function 'vmaxnm_f16' requires target feature 'fullfp16'}}
+  vrndi_f16(v4f16); // expected-error {{always_inline function 'vrndi_f16' requires target feature 'fullfp16'}}
+  // fp16fml
+  vfmlal_low_f16(v2f32, v4f16, v4f16); // expected-error {{always_inline function 'vfmlal_low_f16' requires target feature 'fp16fml'}}
+  // i8mm
+  vmmlaq_s32(v4i32, v8i16, v8i16); // expected-error {{always_inline function 'vmmlaq_s32' requires target feature 'i8mm'}}
+  vusdot_laneq_s32(v2i32, v8i8, v8i16, 0); // expected-error {{always_inline function 'vusdot_s32' requires target feature 'i8mm'}}
+  // bf16
+  vbfdot_f32(v2f32, v4bf16, v4bf16); // expected-error {{always_inline function 'vbfdot_f32' requires target feature 'bf16'}}
+  vcreate_bf16(10);
+  vdup_lane_bf16(v4bf16, 2); // expected-error {{'__builtin_neon_splat_lane_bf16' needs target feature bf16}}
+  vdup_n_bf16(bf16); // expected-error {{always_inline function 'vdup_n_bf16' requires target feature 'bf16'}}
+  vld1_bf16(0); // expected-error {{'__builtin_neon_vld1_bf16' needs target feature bf16}}
+  vcvt_f32_bf16(v4bf16); // expected-error {{always_inline function 'vcvt_f32_bf16' requires target feature 'bf16'}}
+  vcvt_bf16_f32(v4f32); // expected-error {{always_inline function 'vcvt_bf16_f32' requires target feature 'bf16'}}
+}
Index: clang/test/Sema/arm-neon-target.c
===================================================================
--- /dev/null
+++ clang/test/Sema/arm-neon-target.c
@@ -0,0 +1,53 @@
+// RUN: %clang_cc1 -triple armv8a-none-linux-gnu -target-feature +neon -fallow-half-arguments-and-returns -fsyntax-only -verify -emit-llvm -o - %s
+// REQUIRES: arm-registered-target
+
+// Test that functions with the correct target attributes can use the correct NEON intrinsics.
+
+#include <arm_neon.h>
+
+__attribute__((target("dotprod")))
+void dotprod(uint32x2_t v2i32, uint8x16_t v16i8, uint8x8_t v8i8) {
+  vdot_u32(v2i32, v8i8, v8i8);
+}
+
+__attribute__((target("fullfp16")))
+void fp16(uint32x2_t v2i32, uint32x4_t v4i32, uint16x8_t v8i16, uint8x16_t v16i8, uint8x8_t v8i8, float32x2_t v2f32, float32x4_t v4f32, float16x4_t v4f16, bfloat16x4_t v4bf16) {
+  vceqz_f16(v4f16);
+  vrnd_f16(v4f16);
+  vmaxnm_f16(v4f16, v4f16);
+}
+
+__attribute__((target("i8mm")))
+void i8mm(uint32x2_t v2i32, uint32x4_t v4i32, uint16x8_t v8i16, uint8x16_t v16i8, uint8x8_t v8i8, float32x2_t v2f32, float32x4_t v4f32, float16x4_t v4f16, bfloat16x4_t v4bf16) {
+  vmmlaq_s32(v4i32, v8i16, v8i16);
+}
+
+__attribute__((target("bf16")))
+void bf16(uint32x2_t v2i32, uint32x4_t v4i32, uint16x8_t v8i16, uint8x16_t v16i8, uint8x8_t v8i8, float32x2_t v2f32, float32x4_t v4f32, float16x4_t v4f16, bfloat16x4_t v4bf16, __bf16 bf16) {
+  vbfdot_f32(v2f32, v4bf16, v4bf16);
+  vcreate_bf16(10);
+  vdup_lane_bf16(v4bf16, 2);
+  vdup_n_bf16(bf16);
+  vld1_bf16(0);
+  vcvt_f32_bf16(v4bf16);
+  vcvt_bf16_f32(v4f32);
+}
+
+void undefined(uint32x2_t v2i32, uint32x4_t v4i32, uint16x8_t v8i16, uint8x16_t v16i8, uint8x8_t v8i8, float32x2_t v2f32, float32x4_t v4f32, float16x4_t v4f16, bfloat16x4_t v4bf16, __bf16 bf16) {
+  // dotprod
+  vdot_u32(v2i32, v8i8, v8i8); // expected-error {{always_inline function 'vdot_u32' requires target feature 'dotprod'}}
+  // fp16
+  vceqz_f16(v4f16); // expected-error {{always_inline function 'vceqz_f16' requires target feature 'fullfp16'}}
+  vrnd_f16(v4f16); // expected-error {{always_inline function 'vrnd_f16' requires target feature 'fullfp16'}}
+  vmaxnm_f16(v4f16, v4f16); // expected-error {{always_inline function 'vmaxnm_f16' requires target feature 'fullfp16'}}
+  // i8mm
+  vmmlaq_s32(v4i32, v8i16, v8i16); // expected-error {{always_inline function 'vmmlaq_s32' requires target feature 'i8mm'}}
+  // bf16
+  vbfdot_f32(v2f32, v4bf16, v4bf16); // expected-error {{always_inline function 'vbfdot_f32' requires target feature 'bf16'}}
+  vcreate_bf16(10);
+  vdup_lane_bf16(v4bf16, 2); // expected-error {{'__builtin_neon_splat_lane_bf16' needs target feature bf16}}
+  vdup_n_bf16(bf16); // expected-error {{always_inline function 'vdup_n_bf16' requires target feature 'bf16'}}
+  vld1_bf16(0); // expected-error {{'__builtin_neon_vld1_bf16' needs target feature bf16}}
+  vcvt_f32_bf16(v4bf16); // expected-error {{always_inline function 'vcvt_f32_bf16' requires target feature 'bf16'}}
+  vcvt_bf16_f32(v4f32); // expected-error {{always_inline function 'vcvt_bf16_f32' requires target feature 'bf16'}}
+}
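Note: two diagnostic spellings appear in these tests. Intrinsics emitted as always_inline wrapper functions trip CodeGen's check ('always_inline function ... requires target feature ...'), while intrinsics emitted as macros expand directly to a __builtin_neon_* call and trip Sema's TARGET_BUILTIN check ('... needs target feature ...'). A rough sketch of the two shapes in the generated header (abbreviated and hypothetical, not verbatim emitter output):

// Wrapper function: misuse is caught by the always_inline feature check.
__ai __attribute__((target("bf16"))) bfloat16x4_t vdup_n_bf16(bfloat16_t __p0) {
  return (bfloat16x4_t) {__p0, __p0, __p0, __p0};
}

// Macro: misuse surfaces as a '__builtin_neon_vld1_bf16' target-feature error.
#define vld1_bf16(__p0) __builtin_neon_vld1_bf16(__p0)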
Index: clang/utils/TableGen/NeonEmitter.cpp
===================================================================
--- clang/utils/TableGen/NeonEmitter.cpp
+++ clang/utils/TableGen/NeonEmitter.cpp
@@ -321,8 +321,10 @@
   /// The list of DAGs for the body. May be empty, in which case we should
   /// emit a builtin call.
   ListInit *Body;
-  /// The architectural #ifdef guard.
-  std::string Guard;
+  /// The architectural ifdef guard.
+  std::string ArchGuard;
+  /// The architectural target() guard.
+  std::string TargetGuard;
   /// Set if the Unavailable bit is 1. This means we don't generate a body,
   /// just an "unavailable" attribute on a declaration.
   bool IsUnavailable;
@@ -368,9 +370,9 @@
 public:
   Intrinsic(Record *R, StringRef Name, StringRef Proto, TypeSpec OutTS,
             TypeSpec InTS, ClassKind CK, ListInit *Body, NeonEmitter &Emitter,
-            StringRef Guard, bool IsUnavailable, bool BigEndianSafe)
+            StringRef ArchGuard, StringRef TargetGuard, bool IsUnavailable, bool BigEndianSafe)
       : R(R), Name(Name.str()), OutTS(OutTS), InTS(InTS), CK(CK), Body(Body),
-        Guard(Guard.str()), IsUnavailable(IsUnavailable),
+        ArchGuard(ArchGuard.str()), TargetGuard(TargetGuard.str()), IsUnavailable(IsUnavailable),
         BigEndianSafe(BigEndianSafe), PolymorphicKeyType(0), NeededEarly(false),
         UseMacro(false), BaseType(OutTS, "."), InBaseType(InTS, "."),
         Emitter(Emitter) {
@@ -395,7 +397,11 @@
       // Pointer arguments need to use macros to avoid hiding aligned attributes
      // from the pointer type.
-      if (Type.isImmediate() || Type.isPointer())
+
+      // It is not permitted to pass or return an __fp16 by value, so intrinsics
+      // taking a scalar float16_t must be implemented as macros.
+      if (Type.isImmediate() || Type.isPointer() ||
+          (Type.isScalar() && Type.isHalf()))
        UseMacro = true;
    }
  }
@@ -407,7 +413,8 @@
   /// transitive closure.
   const std::set<Intrinsic *> &getDependencies() const { return Dependencies; }
   /// Get the architectural guard string (#ifdef).
-  std::string getGuard() const { return Guard; }
+  std::string getArchGuard() const { return ArchGuard; }
+  std::string getTargetGuard() const { return TargetGuard; }
   /// Get the non-mangled name.
   std::string getName() const { return Name; }
@@ -455,9 +462,11 @@
   void setNeededEarly() { NeededEarly = true; }
 
   bool operator<(const Intrinsic &Other) const {
-    // Sort lexicographically on a two-tuple (Guard, Name)
-    if (Guard != Other.Guard)
-      return Guard < Other.Guard;
+    // Sort lexicographically on a three-tuple (ArchGuard, TargetGuard, Name)
+    if (ArchGuard != Other.ArchGuard)
+      return ArchGuard < Other.ArchGuard;
+    if (TargetGuard != Other.TargetGuard)
+      return TargetGuard < Other.TargetGuard;
     return Name < Other.Name;
   }
@@ -947,7 +956,7 @@
   char typeCode = '\0';
   bool printNumber = true;
 
-  if (CK == ClassB)
+  if (CK == ClassB && TargetGuard == "")
     return "";
 
   if (T.isBFloat16())
@@ -971,7 +980,7 @@
       break;
     }
   }
-  if (CK == ClassB) {
+  if (CK == ClassB && TargetGuard == "") {
     typeCode = '\0';
   }
@@ -1073,7 +1082,7 @@
     S += "_" + getInstTypeCode(InBaseType, LocalCK);
   }
 
-  if (LocalCK == ClassB)
+  if (LocalCK == ClassB && TargetGuard == "")
     S += "_v";
 
   // Insert a 'q' before the first '_' character so that it ends up before
@@ -1133,10 +1142,14 @@
 }
 
 void Intrinsic::emitPrototype(StringRef NamePrefix) {
-  if (UseMacro)
+  if (UseMacro) {
     OS << "#define ";
-  else
-    OS << "__ai " << Types[0].str() << " ";
+  } else {
+    OS << "__ai ";
+    if (TargetGuard != "")
+      OS << "__attribute__((target(\"" << TargetGuard << "\"))) ";
+    OS << Types[0].str() << " ";
+  }
 
   OS << NamePrefix.str() << mangleName(Name, ClassS) << "(";
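Note: with emitPrototype emitting the attribute, a guarded intrinsic's definition moves from preprocessor gating to function-attribute gating. Roughly (illustrative shape, not verbatim generated output):

// Before: only visible under #if defined(__ARM_FEATURE_DOTPROD)
//   __ai uint32x2_t vdot_u32(uint32x2_t __p0, uint8x8_t __p1, uint8x8_t __p2) { ... }
// After: always declared, callable from any function that guarantees the feature
//   __ai __attribute__((target("dotprod"))) uint32x2_t
//   vdot_u32(uint32x2_t __p0, uint8x8_t __p1, uint8x8_t __p2) { ... }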
@@ -1941,7 +1954,8 @@
   std::string Types = std::string(R->getValueAsString("Types"));
   Record *OperationRec = R->getValueAsDef("Operation");
   bool BigEndianSafe = R->getValueAsBit("BigEndianSafe");
-  std::string Guard = std::string(R->getValueAsString("ArchGuard"));
+  std::string ArchGuard = std::string(R->getValueAsString("ArchGuard"));
+  std::string TargetGuard = std::string(R->getValueAsString("TargetGuard"));
   bool IsUnavailable = OperationRec->getValueAsBit("Unavailable");
   std::string CartesianProductWith = std::string(R->getValueAsString("CartesianProductWith"));
@@ -1983,7 +1997,7 @@
   for (auto &I : NewTypeSpecs) {
     Entry.emplace_back(R, Name, Proto, I.first, I.second, CK, Body, *this,
-                       Guard, IsUnavailable, BigEndianSafe);
+                       ArchGuard, TargetGuard, IsUnavailable, BigEndianSafe);
     Out.push_back(&Entry.back());
   }
@@ -1998,22 +2012,31 @@
   // We only want to emit a builtin once, and we want to emit them in
   // alphabetical order, so use a std::set.
-  std::set<std::string> Builtins;
+  std::set<std::pair<std::string, std::string>> Builtins;
 
   for (auto *Def : Defs) {
     if (Def->hasBody())
       continue;
 
-    std::string S = "BUILTIN(__builtin_neon_" + Def->getMangledName() + ", \"";
-
+    std::string S = "__builtin_neon_" + Def->getMangledName() + ", \"";
     S += Def->getBuiltinTypeStr();
-    S += "\", \"n\")";
+    S += "\", \"n\"";
 
-    Builtins.insert(S);
+    Builtins.emplace(S, Def->getTargetGuard());
+  }
+
+  for (auto &S : Builtins) {
+    if (S.second == "")
+      OS << "BUILTIN(";
+    else
+      OS << "TARGET_BUILTIN(";
+    OS << S.first;
+    if (S.second == "")
+      OS << ")\n";
+    else
+      OS << ", \"" << S.second << "\")\n";
   }
 
-  for (auto &S : Builtins)
-    OS << S << "\n";
   OS << "#endif\n\n";
 }
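Note: keying the set on a (builtin string, target guard) pair keeps the output sorted while letting guarded builtins switch to the four-argument macro from TargetBuiltins.h. The resulting BuiltinsNEON.def entries then look roughly like this (type strings elided, entries illustrative):

BUILTIN(__builtin_neon_vabs_v, "...", "n")
TARGET_BUILTIN(__builtin_neon_vdot_u32, "...", "n", "dotprod")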
OS << "typedef float float32_t;\n"; @@ -2357,9 +2378,7 @@ emitNeonTypeDefs("cQcsQsiQilQlUcQUcUsQUsUiQUiUlQUlhQhfQfdQdPcQPcPsQPsPlQPl", OS); - OS << "#ifdef __ARM_FEATURE_BF16\n"; emitNeonTypeDefs("bQb", OS); - OS << "#endif\n\n"; OS << "#define __ai static __inline__ __attribute__((__always_inline__, " "__nodebug__))\n\n"; @@ -2395,10 +2414,10 @@ } // Emit #endif/#if pair if needed. - if ((*I)->getGuard() != InGuard) { + if ((*I)->getArchGuard() != InGuard) { if (!InGuard.empty()) OS << "#endif\n"; - InGuard = (*I)->getGuard(); + InGuard = (*I)->getArchGuard(); if (!InGuard.empty()) OS << "#if " << InGuard << "\n"; } @@ -2504,10 +2523,10 @@ } // Emit #endif/#if pair if needed. - if ((*I)->getGuard() != InGuard) { + if ((*I)->getArchGuard() != InGuard) { if (!InGuard.empty()) OS << "#endif\n"; - InGuard = (*I)->getGuard(); + InGuard = (*I)->getArchGuard(); if (!InGuard.empty()) OS << "#if " << InGuard << "\n"; } @@ -2581,10 +2600,10 @@ } // Emit #endif/#if pair if needed. - if ((*I)->getGuard() != InGuard) { + if ((*I)->getArchGuard() != InGuard) { if (!InGuard.empty()) OS << "#endif\n"; - InGuard = (*I)->getGuard(); + InGuard = (*I)->getArchGuard(); if (!InGuard.empty()) OS << "#if " << InGuard << "\n"; }