diff --git a/clang/include/clang/Basic/arm_neon.td b/clang/include/clang/Basic/arm_neon.td
--- a/clang/include/clang/Basic/arm_neon.td
+++ b/clang/include/clang/Basic/arm_neon.td
@@ -1908,16 +1908,41 @@
   def VCADD_ROT270_FP16 : SInst<"vcadd_rot270", "...", "h">;
   def VCADDQ_ROT90_FP16 : SInst<"vcaddq_rot90", "QQQ", "h">;
   def VCADDQ_ROT270_FP16 : SInst<"vcaddq_rot270", "QQQ", "h">;
+
+  def VCMLA_FP16 : SInst<"vcmla", "....", "h">;
+  def VCMLA_ROT90_FP16 : SInst<"vcmla_rot90", "....", "h">;
+  def VCMLA_ROT180_FP16 : SInst<"vcmla_rot180", "....", "h">;
+  def VCMLA_ROT270_FP16 : SInst<"vcmla_rot270", "....", "h">;
+  def VCMLAQ_FP16 : SInst<"vcmlaq", "QQQQ", "h">;
+  def VCMLAQ_ROT90_FP16 : SInst<"vcmlaq_rot90", "QQQQ", "h">;
+  def VCMLAQ_ROT180_FP16 : SInst<"vcmlaq_rot180", "QQQQ", "h">;
+  def VCMLAQ_ROT270_FP16 : SInst<"vcmlaq_rot270", "QQQQ", "h">;
 }
 let ArchGuard = "defined(__ARM_FEATURE_COMPLEX)" in {
   def VCADD_ROT90 : SInst<"vcadd_rot90", "...", "f">;
   def VCADD_ROT270 : SInst<"vcadd_rot270", "...", "f">;
   def VCADDQ_ROT90 : SInst<"vcaddq_rot90", "QQQ", "f">;
   def VCADDQ_ROT270 : SInst<"vcaddq_rot270", "QQQ", "f">;
+
+  def VCMLA : SInst<"vcmla", "....", "f">;
+  def VCMLA_ROT90 : SInst<"vcmla_rot90", "....", "f">;
+  def VCMLA_ROT180 : SInst<"vcmla_rot180", "....", "f">;
+  def VCMLA_ROT270 : SInst<"vcmla_rot270", "....", "f">;
+
+  def VCMLAQ : SInst<"vcmlaq", "QQQQ", "f">;
+  def VCMLAQ_ROT90 : SInst<"vcmlaq_rot90", "QQQQ", "f">;
+  def VCMLAQ_ROT180 : SInst<"vcmlaq_rot180", "QQQQ", "f">;
+  def VCMLAQ_ROT270 : SInst<"vcmlaq_rot270", "QQQQ", "f">;
+
 }
 let ArchGuard = "defined(__ARM_FEATURE_COMPLEX) && defined(__aarch64__)" in {
   def VCADDQ_ROT90_FP64 : SInst<"vcaddq_rot90", "QQQ", "d">;
   def VCADDQ_ROT270_FP64 : SInst<"vcaddq_rot270", "QQQ", "d">;
+
+  def VCMLAQ_FP64 : SInst<"vcmlaq", "QQQQ", "d">;
+  def VCMLAQ_ROT90_FP64 : SInst<"vcmlaq_rot90", "QQQQ", "d">;
+  def VCMLAQ_ROT180_FP64 : SInst<"vcmlaq_rot180", "QQQQ", "d">;
+  def VCMLAQ_ROT270_FP64 : SInst<"vcmlaq_rot270", "QQQQ", "d">;
 }

 // V8.2-A BFloat intrinsics
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -5548,6 +5548,14 @@
   NEONMAP0(vcltzq_v),
   NEONMAP1(vclz_v, ctlz, Add1ArgType),
   NEONMAP1(vclzq_v, ctlz, Add1ArgType),
+  NEONMAP1(vcmla_rot180_v, aarch64_neon_vcmla_rot180, Add1ArgType),
+  NEONMAP1(vcmla_rot270_v, aarch64_neon_vcmla_rot270, Add1ArgType),
+  NEONMAP1(vcmla_rot90_v, aarch64_neon_vcmla_rot90, Add1ArgType),
+  NEONMAP1(vcmla_v, aarch64_neon_vcmla, Add1ArgType),
+  NEONMAP1(vcmlaq_rot180_v, aarch64_neon_vcmla_rot180, Add1ArgType),
+  NEONMAP1(vcmlaq_rot270_v, aarch64_neon_vcmla_rot270, Add1ArgType),
+  NEONMAP1(vcmlaq_rot90_v, aarch64_neon_vcmla_rot90, Add1ArgType),
+  NEONMAP1(vcmlaq_v, aarch64_neon_vcmla, Add1ArgType),
   NEONMAP1(vcnt_v, ctpop, Add1ArgType),
   NEONMAP1(vcntq_v, ctpop, Add1ArgType),
   NEONMAP1(vcvt_f16_f32, aarch64_neon_vcvtfp2hf, 0),
diff --git a/clang/test/CodeGen/aarch64-neon-vcmla.c b/clang/test/CodeGen/aarch64-neon-vcmla.c
new file mode 100644
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-neon-vcmla.c
@@ -0,0 +1,105 @@
+// RUN: %clang_cc1 -triple aarch64-linux-gnu -target-feature +neon \
+// RUN:   -target-feature +v8.3a -target-feature +fullfp16 -S -emit-llvm -o - %s \
+// RUN:   | FileCheck %s
+
+#include <arm_neon.h>
+
+void foo_16x4(float16x4_t a, float16x4_t b, float16x4_t c) {
+  // CHECK: call <4 x half> @llvm.aarch64.neon.vcmla.v4f16
+  float16x4_t result = vcmla_f16(a, b, c);
+}
+
+void foo_rot90_16x4(float16x4_t a, float16x4_t b, float16x4_t c) {
+  // CHECK: call <4 x half> @llvm.aarch64.neon.vcmla.rot90.v4f16
+  float16x4_t result = vcmla_rot90_f16(a, b, c);
+}
+
+void foo_rot180_16x4(float16x4_t a, float16x4_t b, float16x4_t c) {
+  // CHECK: call <4 x half> @llvm.aarch64.neon.vcmla.rot180.v4f16
+  float16x4_t result = vcmla_rot180_f16(a, b, c);
+}
+
+void foo_rot270_16x4(float16x4_t a, float16x4_t b, float16x4_t c) {
+  // CHECK: call <4 x half> @llvm.aarch64.neon.vcmla.rot270.v4f16
+  float16x4_t result = vcmla_rot270_f16(a, b, c);
+}
+
+void foo_16x8(float16x8_t a, float16x8_t b, float16x8_t c) {
+  // CHECK: call <8 x half> @llvm.aarch64.neon.vcmla.v8f16
+  float16x8_t result = vcmlaq_f16(a, b, c);
+}
+
+void foo_rot90_16x8(float16x8_t a, float16x8_t b, float16x8_t c) {
+  // CHECK: call <8 x half> @llvm.aarch64.neon.vcmla.rot90.v8f16
+  float16x8_t result = vcmlaq_rot90_f16(a, b, c);
+}
+
+void foo_rot180_16x8(float16x8_t a, float16x8_t b, float16x8_t c) {
+  // CHECK: call <8 x half> @llvm.aarch64.neon.vcmla.rot180.v8f16
+  float16x8_t result = vcmlaq_rot180_f16(a, b, c);
+}
+
+void foo_rot270_16x8(float16x8_t a, float16x8_t b, float16x8_t c) {
+  // CHECK: call <8 x half> @llvm.aarch64.neon.vcmla.rot270.v8f16
+  float16x8_t result = vcmlaq_rot270_f16(a, b, c);
+}
+
+void foo_32x2(float32x2_t a, float32x2_t b, float32x2_t c) {
+  // CHECK: call <2 x float> @llvm.aarch64.neon.vcmla.v2f32
+  float32x2_t result = vcmla_f32(a, b, c);
+}
+
+void foo_rot90_32x2(float32x2_t a, float32x2_t b, float32x2_t c) {
+  // CHECK: call <2 x float> @llvm.aarch64.neon.vcmla.rot90.v2f32
+  float32x2_t result = vcmla_rot90_f32(a, b, c);
+}
+
+void foo_rot180_32x2(float32x2_t a, float32x2_t b, float32x2_t c) {
+  // CHECK: call <2 x float> @llvm.aarch64.neon.vcmla.rot180.v2f32
+  float32x2_t result = vcmla_rot180_f32(a, b, c);
+}
+
+void foo_rot270_32x2(float32x2_t a, float32x2_t b, float32x2_t c) {
+  // CHECK: call <2 x float> @llvm.aarch64.neon.vcmla.rot270.v2f32
+  float32x2_t result = vcmla_rot270_f32(a, b, c);
+}
+
+void foo_32x4(float32x4_t a, float32x4_t b, float32x4_t c) {
+  // CHECK: call <4 x float> @llvm.aarch64.neon.vcmla.v4f32
+  float32x4_t result = vcmlaq_f32(a, b, c);
+}
+
+void foo_rot90_32x4(float32x4_t a, float32x4_t b, float32x4_t c) {
+  // CHECK: call <4 x float> @llvm.aarch64.neon.vcmla.rot90.v4f32
+  float32x4_t result = vcmlaq_rot90_f32(a, b, c);
+}
+
+void foo_rot180_32x4(float32x4_t a, float32x4_t b, float32x4_t c) {
+  // CHECK: call <4 x float> @llvm.aarch64.neon.vcmla.rot180.v4f32
+  float32x4_t result = vcmlaq_rot180_f32(a, b, c);
+}
+
+void foo_rot270_32x4(float32x4_t a, float32x4_t b, float32x4_t c) {
+  // CHECK: call <4 x float> @llvm.aarch64.neon.vcmla.rot270.v4f32
+  float32x4_t result = vcmlaq_rot270_f32(a, b, c);
+}
+
+void foo_64x2(float64x2_t a, float64x2_t b, float64x2_t c) {
+  // CHECK: call <2 x double> @llvm.aarch64.neon.vcmla.v2f64
+  float64x2_t result = vcmlaq_f64(a, b, c);
+}
+
+void foo_rot90_64x2(float64x2_t a, float64x2_t b, float64x2_t c) {
+  // CHECK: call <2 x double> @llvm.aarch64.neon.vcmla.rot90.v2f64
+  float64x2_t result = vcmlaq_rot90_f64(a, b, c);
+}
+
+void foo_rot180_64x2(float64x2_t a, float64x2_t b, float64x2_t c) {
+  // CHECK: call <2 x double> @llvm.aarch64.neon.vcmla.rot180.v2f64
+  float64x2_t result = vcmlaq_rot180_f64(a, b, c);
+}
+
+void foo_rot270_64x2(float64x2_t a, float64x2_t b, float64x2_t c) {
+  // CHECK: call <2 x double> @llvm.aarch64.neon.vcmla.rot270.v2f64
+  float64x2_t result = vcmlaq_rot270_f64(a, b, c);
+}
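
Note for reviewers (not part of the patch): a minimal usage sketch of how the new intrinsics are expected to compose, assuming a toolchain where these builtins are enabled (e.g. -march=armv8.3-a). As I read the Armv8.3-A FCMLA definition, the rot0 form accumulates a.re*b.re into the real lanes and a.re*b.im into the imaginary lanes, while the rot90 form accumulates -a.im*b.im and a.im*b.re, so chaining the two gives a full complex multiply-accumulate. The function name complex_mla is hypothetical, not part of ACLE.

  #include <arm_neon.h>

  // acc += a * b for complex floats laid out as [re0, im0, re1, im1].
  float32x4_t complex_mla(float32x4_t acc, float32x4_t a, float32x4_t b) {
    // rot0:  acc.re += a.re * b.re;  acc.im += a.re * b.im
    acc = vcmlaq_f32(acc, a, b);
    // rot90: acc.re += -a.im * b.im; acc.im += a.im * b.re
    acc = vcmlaq_rot90_f32(acc, a, b);
    return acc;
  }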