Index: include/clang/Basic/arm_neon.td =================================================================== --- include/clang/Basic/arm_neon.td +++ include/clang/Basic/arm_neon.td @@ -206,6 +206,15 @@ : Op<(call "vdot", $p0, $p1, (bitcast $p1, (splat(bitcast "uint32x4_t", $p2), $p3)))>; +def OP_FMLAL_LN : Op<(call "vfmlal_low", $p0, $p1, + (dup_typed $p1, (call "vget_lane", $p2, $p3)))>; +def OP_FMLSL_LN : Op<(call "vfmlsl_low", $p0, $p1, + (dup_typed $p1, (call "vget_lane", $p2, $p3)))>; +def OP_FMLAL_LN_Hi : Op<(call "vfmlal_high", $p0, $p1, + (dup_typed $p1, (call "vget_lane", $p2, $p3)))>; +def OP_FMLSL_LN_Hi : Op<(call "vfmlsl_high", $p0, $p1, + (dup_typed $p1, (call "vget_lane", $p2, $p3)))>; + //===----------------------------------------------------------------------===// // Instructions //===----------------------------------------------------------------------===// @@ -1640,3 +1649,21 @@ // Variants indexing into a 128-bit vector are A64 only. def UDOT_LANEQ : SOpInst<"vdot_laneq", "dd89i", "iUiQiQUi", OP_DOT_LNQ>; } + +// v8.2-A FP16 fused multiply-add long instructions. 
+let ArchGuard = "defined(__ARM_FEATURE_FP16FML) && defined(__aarch64__)" in { + def VFMLAL_LOW : SInst<"vfmlal_low", "ffHH", "UiQUi">; + def VFMLSL_LOW : SInst<"vfmlsl_low", "ffHH", "UiQUi">; + def VFMLAL_HIGH : SInst<"vfmlal_high", "ffHH", "UiQUi">; + def VFMLSL_HIGH : SInst<"vfmlsl_high", "ffHH", "UiQUi">; + + def VFMLAL_LANE_LOW : SOpInst<"vfmlal_lane_low", "ffH0i", "UiQUi", OP_FMLAL_LN>; + def VFMLSL_LANE_LOW : SOpInst<"vfmlsl_lane_low", "ffH0i", "UiQUi", OP_FMLSL_LN>; + def VFMLAL_LANE_HIGH : SOpInst<"vfmlal_lane_high", "ffH0i", "UiQUi", OP_FMLAL_LN_Hi>; + def VFMLSL_LANE_HIGH : SOpInst<"vfmlsl_lane_high", "ffH0i", "UiQUi", OP_FMLSL_LN_Hi>; + + def VFMLAL_LANEQ_LOW : SOpInst<"vfmlal_laneq_low", "ffH1i", "UiQUi", OP_FMLAL_LN>; + def VFMLSL_LANEQ_LOW : SOpInst<"vfmlsl_laneq_low", "ffH1i", "UiQUi", OP_FMLSL_LN>; + def VFMLAL_LANEQ_HIGH : SOpInst<"vfmlal_laneq_high", "ffH1i", "UiQUi", OP_FMLAL_LN_Hi>; + def VFMLSL_LANEQ_HIGH : SOpInst<"vfmlsl_laneq_high", "ffH1i", "UiQUi", OP_FMLSL_LN_Hi>; +} Index: include/clang/Basic/arm_neon_incl.td =================================================================== --- include/clang/Basic/arm_neon_incl.td +++ include/clang/Basic/arm_neon_incl.td @@ -96,6 +96,11 @@ // example: (dup $p1) -> "(uint32x2_t) {__p1, __p1}" (assuming the base type // is uint32x2_t). def dup; +// dup_typed - Take a vector and a scalar argument, and create a new vector of +// the same type by duplicating the scalar value into all lanes. +// example: (dup_typed $p1, $p2) -> "(float16x4_t) {__p2, __p2, __p2, __p2}" +// (assuming __p1 is float16x4_t, and __p2 is a compatible scalar). +def dup_typed; // splat - Take a vector and a lane index, and return a vector of the same type // containing repeated instances of the source vector at the lane index. // example: (splat $p0, $p1) -> @@ -229,6 +234,8 @@ // f: float (int args) // F: double (int args) // H: half (int args) +// 0: half (int args), ignore 'Q' size modifier. 
+// 1: half (int args), force 'Q' size modifier. // d: default // g: default, ignore 'Q' size modifier. // j: default, force 'Q' size modifier. Index: lib/Basic/Targets/AArch64.h =================================================================== --- lib/Basic/Targets/AArch64.h +++ lib/Basic/Targets/AArch64.h @@ -34,6 +34,7 @@ unsigned Unaligned; unsigned HasFullFP16; unsigned HasDotProd; + unsigned HasFP16FML; llvm::AArch64::ArchKind ArchKind; static const Builtin::Info BuiltinInfo[]; Index: lib/Basic/Targets/AArch64.cpp =================================================================== --- lib/Basic/Targets/AArch64.cpp +++ lib/Basic/Targets/AArch64.cpp @@ -194,6 +194,9 @@ if (HasDotProd) Builder.defineMacro("__ARM_FEATURE_DOTPROD", "1"); + if ((FPU & NeonMode) && HasFP16FML) + Builder.defineMacro("__ARM_FEATURE_FP16FML", "1"); + switch (ArchKind) { default: break; @@ -231,6 +234,7 @@ Unaligned = 1; HasFullFP16 = 0; HasDotProd = 0; + HasFP16FML = 0; ArchKind = llvm::AArch64::ArchKind::ARMV8A; for (const auto &Feature : Features) { @@ -252,6 +256,8 @@ HasFullFP16 = 1; if (Feature == "+dotprod") HasDotProd = 1; + if (Feature == "+fp16fml") + HasFP16FML = 1; } setDataLayout(); Index: lib/CodeGen/CGBuiltin.cpp =================================================================== --- lib/CodeGen/CGBuiltin.cpp +++ lib/CodeGen/CGBuiltin.cpp @@ -4368,6 +4368,14 @@ NEONMAP0(vextq_v), NEONMAP0(vfma_v), NEONMAP0(vfmaq_v), + NEONMAP1(vfmlal_high_v, aarch64_neon_fmlal2, 0), + NEONMAP1(vfmlal_low_v, aarch64_neon_fmlal, 0), + NEONMAP1(vfmlalq_high_v, aarch64_neon_fmlal2, 0), + NEONMAP1(vfmlalq_low_v, aarch64_neon_fmlal, 0), + NEONMAP1(vfmlsl_high_v, aarch64_neon_fmlsl2, 0), + NEONMAP1(vfmlsl_low_v, aarch64_neon_fmlsl, 0), + NEONMAP1(vfmlslq_high_v, aarch64_neon_fmlsl2, 0), + NEONMAP1(vfmlslq_low_v, aarch64_neon_fmlsl, 0), NEONMAP2(vhadd_v, aarch64_neon_uhadd, aarch64_neon_shadd, Add1ArgType | UnsignedAlts), NEONMAP2(vhaddq_v, aarch64_neon_uhadd, aarch64_neon_shadd, Add1ArgType | 
UnsignedAlts), NEONMAP2(vhsub_v, aarch64_neon_uhsub, aarch64_neon_shsub, Add1ArgType | UnsignedAlts), @@ -5341,6 +5349,34 @@ Int = Usgn ? LLVMIntrinsic : AltLLVMIntrinsic; return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vdot"); } + case NEON::BI__builtin_neon_vfmlal_low_v: + case NEON::BI__builtin_neon_vfmlalq_low_v: { + llvm::Type *InputTy = + llvm::VectorType::get(HalfTy, Ty->getPrimitiveSizeInBits() / 16); + llvm::Type *Tys[2] = { Ty, InputTy }; + return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vfmlal_low"); + } + case NEON::BI__builtin_neon_vfmlsl_low_v: + case NEON::BI__builtin_neon_vfmlslq_low_v: { + llvm::Type *InputTy = + llvm::VectorType::get(HalfTy, Ty->getPrimitiveSizeInBits() / 16); + llvm::Type *Tys[2] = { Ty, InputTy }; + return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vfmlsl_low"); + } + case NEON::BI__builtin_neon_vfmlal_high_v: + case NEON::BI__builtin_neon_vfmlalq_high_v: { + llvm::Type *InputTy = + llvm::VectorType::get(HalfTy, Ty->getPrimitiveSizeInBits() / 16); + llvm::Type *Tys[2] = { Ty, InputTy }; + return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vfmlal_high"); + } + case NEON::BI__builtin_neon_vfmlsl_high_v: + case NEON::BI__builtin_neon_vfmlslq_high_v: { + llvm::Type *InputTy = + llvm::VectorType::get(HalfTy, Ty->getPrimitiveSizeInBits() / 16); + llvm::Type *Tys[2] = { Ty, InputTy }; + return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vfmlsl_high"); + } } assert(Int && "Expected valid intrinsic number"); Index: test/CodeGen/aarch64-neon-fp16fml.c =================================================================== --- /dev/null +++ test/CodeGen/aarch64-neon-fp16fml.c @@ -0,0 +1,196 @@ +// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +v8.2a -target-feature +neon -target-feature +fp16fml \ +// RUN: -fallow-half-arguments-and-returns -disable-O0-optnone -emit-llvm -o - %s | opt -S -instcombine | FileCheck %s + +// REQUIRES: aarch64-registered-target + +// Test AArch64 Armv8.2-A FP16 Fused 
Multiply-Add Long intrinsics + +#include <arm_neon.h> + +// Vector form + +float32x2_t test_vfmlal_low_u32(float32x2_t a, float16x4_t b, float16x4_t c) { +// CHECK-LABEL: define <2 x float> @test_vfmlal_low_u32(<2 x float> %a, <4 x half> %b, <4 x half> %c) +// CHECK: [[RESULT:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal.v2f32.v4f16(<2 x float> %a, <4 x half> %b, <4 x half> %c) +// CHECK: ret <2 x float> [[RESULT]] + return vfmlal_low_u32(a, b, c); +} + +float32x2_t test_vfmlsl_low_u32(float32x2_t a, float16x4_t b, float16x4_t c) { +// CHECK-LABEL: define <2 x float> @test_vfmlsl_low_u32(<2 x float> %a, <4 x half> %b, <4 x half> %c) +// CHECK: [[RESULT:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl.v2f32.v4f16(<2 x float> %a, <4 x half> %b, <4 x half> %c) +// CHECK: ret <2 x float> [[RESULT]] + return vfmlsl_low_u32(a, b, c); +} + +float32x2_t test_vfmlal_high_u32(float32x2_t a, float16x4_t b, float16x4_t c) { +// CHECK-LABEL: define <2 x float> @test_vfmlal_high_u32(<2 x float> %a, <4 x half> %b, <4 x half> %c) +// CHECK: [[RESULT:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal2.v2f32.v4f16(<2 x float> %a, <4 x half> %b, <4 x half> %c) +// CHECK: ret <2 x float> [[RESULT]] + return vfmlal_high_u32(a, b, c); +} + +float32x2_t test_vfmlsl_high_u32(float32x2_t a, float16x4_t b, float16x4_t c) { +// CHECK-LABEL: define <2 x float> @test_vfmlsl_high_u32(<2 x float> %a, <4 x half> %b, <4 x half> %c) +// CHECK: [[RESULT:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl2.v2f32.v4f16(<2 x float> %a, <4 x half> %b, <4 x half> %c) +// CHECK: ret <2 x float> [[RESULT]] + return vfmlsl_high_u32(a, b, c); +} + +float32x4_t test_vfmlalq_low_u32(float32x4_t a, float16x8_t b, float16x8_t c) { +// CHECK-LABEL: define <4 x float> @test_vfmlalq_low_u32(<4 x float> %a, <8 x half> %b, <8 x half> %c) +// CHECK: [[RESULT:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal.v4f32.v8f16(<4 x float> %a, <8 x half> %b, <8 x half> %c) +// CHECK: ret <4 x float> [[RESULT]] + return vfmlalq_low_u32(a, 
b, c); +} + +float32x4_t test_vfmlslq_low_u32(float32x4_t a, float16x8_t b, float16x8_t c) { +// CHECK-LABEL: define <4 x float> @test_vfmlslq_low_u32(<4 x float> %a, <8 x half> %b, <8 x half> %c) +// CHECK: [[RESULT:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl.v4f32.v8f16(<4 x float> %a, <8 x half> %b, <8 x half> %c) +// CHECK: ret <4 x float> [[RESULT]] + return vfmlslq_low_u32(a, b, c); +} + +float32x4_t test_vfmlalq_high_u32(float32x4_t a, float16x8_t b, float16x8_t c) { +// CHECK-LABEL: define <4 x float> @test_vfmlalq_high_u32(<4 x float> %a, <8 x half> %b, <8 x half> %c) +// CHECK: [[RESULT:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal2.v4f32.v8f16(<4 x float> %a, <8 x half> %b, <8 x half> %c) +// CHECK: ret <4 x float> [[RESULT]] + return vfmlalq_high_u32(a, b, c); +} + +float32x4_t test_vfmlslq_high_u32(float32x4_t a, float16x8_t b, float16x8_t c) { +// CHECK-LABEL: define <4 x float> @test_vfmlslq_high_u32(<4 x float> %a, <8 x half> %b, <8 x half> %c) +// CHECK: [[RESULT:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl2.v4f32.v8f16(<4 x float> %a, <8 x half> %b, <8 x half> %c) +// CHECK: ret <4 x float> [[RESULT]] + return vfmlslq_high_u32(a, b, c); +} + +// Indexed form + +float32x2_t test_vfmlal_lane_low_u32(float32x2_t a, float16x4_t b, float16x4_t c) { +// CHECK-LABEL: define <2 x float> @test_vfmlal_lane_low_u32(<2 x float> %a, <4 x half> %b, <4 x half> %c) +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x half> %c, <4 x half> undef, <4 x i32> zeroinitializer +// CHECK: [[RESULT:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal.v2f32.v4f16(<2 x float> %a, <4 x half> %b, <4 x half> [[SHUFFLE]]) +// CHECK: ret <2 x float> [[RESULT]] + return vfmlal_lane_low_u32(a, b, c, 0); +} + +float32x2_t test_vfmlal_lane_high_u32(float32x2_t a, float16x4_t b, float16x4_t c) { +// CHECK-LABEL: define <2 x float> @test_vfmlal_lane_high_u32(<2 x float> %a, <4 x half> %b, <4 x half> %c) +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x half> %c, <4 x half> undef, 
<4 x i32> <i32 1, i32 1, i32 1, i32 1> +// CHECK: [[RESULT:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal2.v2f32.v4f16(<2 x float> %a, <4 x half> %b, <4 x half> [[SHUFFLE]]) +// CHECK: ret <2 x float> [[RESULT]] + return vfmlal_lane_high_u32(a, b, c, 1); +} + +float32x4_t test_vfmlalq_lane_low_u32(float32x4_t a, float16x8_t b, float16x4_t c) { +// CHECK-LABEL: define <4 x float> @test_vfmlalq_lane_low_u32(<4 x float> %a, <8 x half> %b, <4 x half> %c) +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x half> %c, <4 x half> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2> +// CHECK: [[RESULT:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal.v4f32.v8f16(<4 x float> %a, <8 x half> %b, <8 x half> [[SHUFFLE]]) +// CHECK: ret <4 x float> [[RESULT]] + return vfmlalq_lane_low_u32(a, b, c, 2); +} + +float32x4_t test_vfmlalq_lane_high_u32(float32x4_t a, float16x8_t b, float16x4_t c) { +// CHECK-LABEL: define <4 x float> @test_vfmlalq_lane_high_u32(<4 x float> %a, <8 x half> %b, <4 x half> %c) +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x half> %c, <4 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> +// CHECK: [[RESULT:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal2.v4f32.v8f16(<4 x float> %a, <8 x half> %b, <8 x half> [[SHUFFLE]]) +// CHECK: ret <4 x float> [[RESULT]] + return vfmlalq_lane_high_u32(a, b, c, 3); +} + +float32x2_t test_vfmlal_laneq_low_u32(float32x2_t a, float16x4_t b, float16x8_t c) { +// CHECK-LABEL: define <2 x float> @test_vfmlal_laneq_low_u32(<2 x float> %a, <4 x half> %b, <8 x half> %c) +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x half> %c, <8 x half> undef, <4 x i32> <i32 4, i32 4, i32 4, i32 4> +// CHECK: [[RESULT:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal.v2f32.v4f16(<2 x float> %a, <4 x half> %b, <4 x half> [[SHUFFLE]]) +// CHECK: ret <2 x float> [[RESULT]] + return vfmlal_laneq_low_u32(a, b, c, 4); +} + +float32x2_t test_vfmlal_laneq_high_u32(float32x2_t a, float16x4_t b, float16x8_t c) { +// CHECK-LABEL: define <2 x float> @test_vfmlal_laneq_high_u32(<2 x float> %a, <4 x half> %b, <8 x half> %c) +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x 
half> %c, <8 x half> undef, <4 x i32> <i32 5, i32 5, i32 5, i32 5> +// CHECK: [[RESULT:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal2.v2f32.v4f16(<2 x float> %a, <4 x half> %b, <4 x half> [[SHUFFLE]]) +// CHECK: ret <2 x float> [[RESULT]] + return vfmlal_laneq_high_u32(a, b, c, 5); +} + +float32x4_t test_vfmlalq_laneq_low_u32(float32x4_t a, float16x8_t b, float16x8_t c) { +// CHECK-LABEL: define <4 x float> @test_vfmlalq_laneq_low_u32(<4 x float> %a, <8 x half> %b, <8 x half> %c) +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x half> %c, <8 x half> undef, <8 x i32> <i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6> +// CHECK: [[RESULT:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal.v4f32.v8f16(<4 x float> %a, <8 x half> %b, <8 x half> [[SHUFFLE]]) +// CHECK: ret <4 x float> [[RESULT]] + return vfmlalq_laneq_low_u32(a, b, c, 6); +} + +float32x4_t test_vfmlalq_laneq_high_u32(float32x4_t a, float16x8_t b, float16x8_t c) { +// CHECK-LABEL: define <4 x float> @test_vfmlalq_laneq_high_u32(<4 x float> %a, <8 x half> %b, <8 x half> %c) +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x half> %c, <8 x half> undef, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7> +// CHECK: [[RESULT:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal2.v4f32.v8f16(<4 x float> %a, <8 x half> %b, <8 x half> [[SHUFFLE]]) +// CHECK: ret <4 x float> [[RESULT]] + return vfmlalq_laneq_high_u32(a, b, c, 7); +} + +float32x2_t test_vfmlsl_lane_low_u32(float32x2_t a, float16x4_t b, float16x4_t c) { +// CHECK-LABEL: define <2 x float> @test_vfmlsl_lane_low_u32(<2 x float> %a, <4 x half> %b, <4 x half> %c) +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x half> %c, <4 x half> undef, <4 x i32> zeroinitializer +// CHECK: [[RESULT:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl.v2f32.v4f16(<2 x float> %a, <4 x half> %b, <4 x half> [[SHUFFLE]]) +// CHECK: ret <2 x float> [[RESULT]] + return vfmlsl_lane_low_u32(a, b, c, 0); +} + +float32x2_t test_vfmlsl_lane_high_u32(float32x2_t a, float16x4_t b, float16x4_t c) { +// CHECK-LABEL: define <2 x float> @test_vfmlsl_lane_high_u32(<2 x float> %a, <4 x half> %b, <4 x half> %c) +// 
CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x half> %c, <4 x half> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> +// CHECK: [[RESULT:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl2.v2f32.v4f16(<2 x float> %a, <4 x half> %b, <4 x half> [[SHUFFLE]]) +// CHECK: ret <2 x float> [[RESULT]] + return vfmlsl_lane_high_u32(a, b, c, 1); +} + +float32x4_t test_vfmlslq_lane_low_u32(float32x4_t a, float16x8_t b, float16x4_t c) { +// CHECK-LABEL: define <4 x float> @test_vfmlslq_lane_low_u32(<4 x float> %a, <8 x half> %b, <4 x half> %c) +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x half> %c, <4 x half> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2> +// CHECK: [[RESULT:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl.v4f32.v8f16(<4 x float> %a, <8 x half> %b, <8 x half> [[SHUFFLE]]) +// CHECK: ret <4 x float> [[RESULT]] + return vfmlslq_lane_low_u32(a, b, c, 2); +} + +float32x4_t test_vfmlslq_lane_high_u32(float32x4_t a, float16x8_t b, float16x4_t c) { +// CHECK-LABEL: define <4 x float> @test_vfmlslq_lane_high_u32(<4 x float> %a, <8 x half> %b, <4 x half> %c) +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x half> %c, <4 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> +// CHECK: [[RESULT:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl2.v4f32.v8f16(<4 x float> %a, <8 x half> %b, <8 x half> [[SHUFFLE]]) +// CHECK: ret <4 x float> [[RESULT]] + return vfmlslq_lane_high_u32(a, b, c, 3); +} + +float32x2_t test_vfmlsl_laneq_low_u32(float32x2_t a, float16x4_t b, float16x8_t c) { +// CHECK-LABEL: define <2 x float> @test_vfmlsl_laneq_low_u32(<2 x float> %a, <4 x half> %b, <8 x half> %c) +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x half> %c, <8 x half> undef, <4 x i32> <i32 4, i32 4, i32 4, i32 4> +// CHECK: [[RESULT:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl.v2f32.v4f16(<2 x float> %a, <4 x half> %b, <4 x half> [[SHUFFLE]]) +// CHECK: ret <2 x float> [[RESULT]] + return vfmlsl_laneq_low_u32(a, b, c, 4); +} + +float32x2_t test_vfmlsl_laneq_high_u32(float32x2_t a, float16x4_t b, float16x8_t c) { +// CHECK-LABEL: define <2 x float> @test_vfmlsl_laneq_high_u32(<2 x float> %a, <4 x half> 
%b, <8 x half> %c) +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x half> %c, <8 x half> undef, <4 x i32> <i32 5, i32 5, i32 5, i32 5> +// CHECK: [[RESULT:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl2.v2f32.v4f16(<2 x float> %a, <4 x half> %b, <4 x half> [[SHUFFLE]]) +// CHECK: ret <2 x float> [[RESULT]] + return vfmlsl_laneq_high_u32(a, b, c, 5); +} + +float32x4_t test_vfmlslq_laneq_low_u32(float32x4_t a, float16x8_t b, float16x8_t c) { +// CHECK-LABEL: define <4 x float> @test_vfmlslq_laneq_low_u32(<4 x float> %a, <8 x half> %b, <8 x half> %c) +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x half> %c, <8 x half> undef, <8 x i32> <i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6> +// CHECK: [[RESULT:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl.v4f32.v8f16(<4 x float> %a, <8 x half> %b, <8 x half> [[SHUFFLE]]) +// CHECK: ret <4 x float> [[RESULT]] + return vfmlslq_laneq_low_u32(a, b, c, 6); +} + +float32x4_t test_vfmlslq_laneq_high_u32(float32x4_t a, float16x8_t b, float16x8_t c) { +// CHECK-LABEL: define <4 x float> @test_vfmlslq_laneq_high_u32(<4 x float> %a, <8 x half> %b, <8 x half> %c) +// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x half> %c, <8 x half> undef, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7> +// CHECK: [[RESULT:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl2.v4f32.v8f16(<4 x float> %a, <8 x half> %b, <8 x half> [[SHUFFLE]]) +// CHECK: ret <4 x float> [[RESULT]] + return vfmlslq_laneq_high_u32(a, b, c, 7); +} Index: test/Preprocessor/aarch64-target-features.c =================================================================== --- test/Preprocessor/aarch64-target-features.c +++ test/Preprocessor/aarch64-target-features.c @@ -93,16 +93,20 @@ // RUN: %clang -target aarch64-none-linux-gnu -march=armv8.2a+dotprod -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-DOTPROD %s // CHECK-DOTPROD: __ARM_FEATURE_DOTPROD 1 -// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.2-a+nofp16fml+fp16 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s -// RUN: %clang -target aarch64-none-linux-gnueabi 
-march=armv8.2-a+nofp16+fp16fml -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s -// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.2-a+fp16+nofp16fml -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s -// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8-a+fp16fml -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s -// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8-a+fp16 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s -// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.4-a+nofp16fml+fp16 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s -// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.4-a+nofp16+fp16fml -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s -// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.4-a+fp16+nofp16fml -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s -// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.4-a+fp16fml -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s -// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.4-a+fp16 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s +// On ARMv8.2-A and above, +fp16fml implies +fp16. +// On ARMv8.4-A and above, +fp16 implies +fp16fml. 
+// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.2-a+nofp16fml+fp16 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-NOFML --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s +// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.2-a+nofp16+fp16fml -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-FML --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s +// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.2-a+fp16+nofp16fml -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-NOFML --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s +// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8-a+fp16fml -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-FML --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s +// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8-a+fp16 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-NOFML --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s +// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.4-a+nofp16fml+fp16 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-FML --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s +// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.4-a+nofp16+fp16fml -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-FML --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s +// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.4-a+fp16+nofp16fml -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-NOFML --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s +// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.4-a+fp16fml -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-FML --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s +// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.4-a+fp16 -x c -E -dM %s -o - 
| FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-FML --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s +// CHECK-FULLFP16-FML: #define __ARM_FEATURE_FP16FML 1 +// CHECK-FULLFP16-NOFML-NOT: #define __ARM_FEATURE_FP16FML 1 // CHECK-FULLFP16-VECTOR-SCALAR: #define __ARM_FEATURE_FP16_SCALAR_ARITHMETIC 1 // CHECK-FULLFP16-VECTOR-SCALAR: #define __ARM_FEATURE_FP16_VECTOR_ARITHMETIC 1 // CHECK-FULLFP16-VECTOR-SCALAR: #define __ARM_FP 0xE @@ -114,6 +118,7 @@ // RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8-a+fp16+nosimd -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-SCALAR %s // RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.4-a+fp16fml+nosimd -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-SCALAR %s // RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.4-a+fp16+nosimd -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-SCALAR %s +// CHECK-FULLFP16-SCALAR-NOT: #define __ARM_FEATURE_FP16FML 1 // CHECK-FULLFP16-SCALAR: #define __ARM_FEATURE_FP16_SCALAR_ARITHMETIC 1 // CHECK-FULLFP16-SCALAR-NOT: #define __ARM_FEATURE_FP16_VECTOR_ARITHMETIC 1 // CHECK-FULLFP16-SCALAR: #define __ARM_FP 0xE @@ -127,10 +132,11 @@ // RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.4-a+nofp16 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-NOFML-VECTOR-SCALAR %s // RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.4-a+nofp16fml -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-NOFML-VECTOR-SCALAR %s // RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.4-a+fp16fml+nofp16 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-NOFML-VECTOR-SCALAR %s +// CHECK-FULLFP16-NOFML-VECTOR-SCALAR-NOT: #define __ARM_FEATURE_FP16FML 1 // CHECK-FULLFP16-NOFML-VECTOR-SCALAR-NOT: #define __ARM_FEATURE_FP16_SCALAR_ARITHMETIC 1 // 
CHECK-FULLFP16-NOFML-VECTOR-SCALAR-NOT: #define __ARM_FEATURE_FP16_VECTOR_ARITHMETIC 1 -// CHECK-FULLFP16-NOFML-VECTOR-SCALAR: #define __ARM_FP 0xE -// CHECK-FULLFP16-NOFML-VECTOR-SCALAR: #define __ARM_FP16_FORMAT_IEEE 1 +// CHECK-FULLFP16-NOFML-VECTOR-SCALAR: #define __ARM_FP 0xE +// CHECK-FULLFP16-NOFML-VECTOR-SCALAR: #define __ARM_FP16_FORMAT_IEEE 1 // ================== Check whether -mtune accepts mixed-case features. // RUN: %clang -target aarch64 -mtune=CYCLONE -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-MTUNE-CYCLONE %s Index: utils/TableGen/NeonEmitter.cpp =================================================================== --- utils/TableGen/NeonEmitter.cpp +++ utils/TableGen/NeonEmitter.cpp @@ -494,6 +494,7 @@ std::pair<Type, std::string> emitDagSaveTemp(DagInit *DI); std::pair<Type, std::string> emitDagSplat(DagInit *DI); std::pair<Type, std::string> emitDagDup(DagInit *DI); + std::pair<Type, std::string> emitDagDupTyped(DagInit *DI); std::pair<Type, std::string> emitDagShuffle(DagInit *DI); std::pair<Type, std::string> emitDagCast(DagInit *DI, bool IsBitCast); std::pair<Type, std::string> emitDagCall(DagInit *DI); @@ -897,6 +898,18 @@ Float = true; ElementBitwidth = 16; break; + case '0': + Float = true; + if (AppliedQuad) + Bitwidth /= 2; + ElementBitwidth = 16; + break; + case '1': + Float = true; + if (!AppliedQuad) + Bitwidth *= 2; + ElementBitwidth = 16; + break; case 'g': if (AppliedQuad) Bitwidth /= 2; @@ -1507,6 +1520,8 @@ return emitDagShuffle(DI); if (Op == "dup") return emitDagDup(DI); + if (Op == "dup_typed") + return emitDagDupTyped(DI); if (Op == "splat") return emitDagSplat(DI); if (Op == "save_temp") @@ -1771,6 +1786,28 @@ return std::make_pair(T, S); } +std::pair<Type, std::string> Intrinsic::DagEmitter::emitDagDupTyped(DagInit *DI) { + assert_with_loc(DI->getNumArgs() == 2, "dup_typed() expects two arguments"); + std::pair<Type, std::string> A = emitDagArg(DI->getArg(0), + DI->getArgNameStr(0)); + std::pair<Type, std::string> B = emitDagArg(DI->getArg(1), + DI->getArgNameStr(1)); + assert_with_loc(B.first.isScalar(), + "dup_typed() requires a scalar as the second argument"); + + Type T = A.first; + 
assert_with_loc(T.isVector(), "dup_typed() used but target type is scalar!"); + std::string S = "(" + T.str() + ") {"; + for (unsigned I = 0; I < T.getNumElements(); ++I) { + if (I != 0) + S += ", "; + S += B.second; + } + S += "}"; + + return std::make_pair(T, S); +} + std::pair<Type, std::string> Intrinsic::DagEmitter::emitDagSplat(DagInit *DI) { assert_with_loc(DI->getNumArgs() == 2, "splat() expects two arguments"); std::pair<Type, std::string> A = emitDagArg(DI->getArg(0),