diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -8517,6 +8517,15 @@
                          (vector_extract (v8f16 FPR128:$Rn), (i64 1))),
           (f16 (FADDPv2i16p (EXTRACT_SUBREG FPR128:$Rn, dsub)))>;
 
+// Prefer using the bottom lanes of faddp Rn, Rn compared to
+// faddp extractlow(Rn), extracthigh(Rn)
+def : Pat<(AArch64faddp (v2f32 (extract_subvector (v4f32 FPR128:$Rn), (i64 0))),
+                        (v2f32 (extract_subvector (v4f32 FPR128:$Rn), (i64 2)))),
+          (v2f32 (EXTRACT_SUBREG (FADDPv4f32 $Rn, $Rn), dsub))>;
+def : Pat<(AArch64faddp (v4f16 (extract_subvector (v8f16 FPR128:$Rn), (i64 0))),
+                        (v4f16 (extract_subvector (v8f16 FPR128:$Rn), (i64 4)))),
+          (v4f16 (EXTRACT_SUBREG (FADDPv8f16 $Rn, $Rn), dsub))>;
+
 // Scalar 64-bit shifts in FPR64 registers.
 def : Pat<(i64 (int_aarch64_neon_sshl (i64 FPR64:$Rn), (i64 FPR64:$Rm))),
           (SSHLv1i64 FPR64:$Rn, FPR64:$Rm)>;
diff --git a/llvm/test/CodeGen/AArch64/faddp.ll b/llvm/test/CodeGen/AArch64/faddp.ll
--- a/llvm/test/CodeGen/AArch64/faddp.ll
+++ b/llvm/test/CodeGen/AArch64/faddp.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc --mtriple aarch64 < %s | FileCheck %s
+; RUN: llc --mtriple aarch64 -mattr=+fullfp16 < %s | FileCheck %s
 
 define float @faddp_2xfloat(<2 x float> %a) {
 ; CHECK-LABEL: faddp_2xfloat:
@@ -256,6 +256,39 @@
   ret <16 x float> %b
 }
 
+define float @faddp_v4f32(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: faddp_v4f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fadd v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    faddp v0.4s, v0.4s, v0.4s
+; CHECK-NEXT:    faddp s0, v0.2s
+; CHECK-NEXT:    ret
+  %1 = fadd <4 x float> %a, %b
+  %2 = shufflevector <4 x float> %1, <4 x float> poison, <2 x i32> <i32 0, i32 1>
+  %3 = shufflevector <4 x float> %1, <4 x float> poison, <2 x i32> <i32 2, i32 3>
+  %4 = tail call <2 x float> @llvm.aarch64.neon.faddp.v2f32(<2 x float> %2, <2 x float> %3)
+  %5 = shufflevector <2 x float> %4, <2 x float> poison, <2 x i32> <i32 1, i32 poison>
+  %6 = fadd <2 x float> %4, %5
+  %7 = extractelement <2 x float> %6, i64 0
+  ret float %7
+}
+
+define <4 x half> @faddp_v8f16(<8 x half> %a, <8 x half> %b) {
+; CHECK-LABEL: faddp_v8f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fadd v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    faddp v0.8h, v0.8h, v0.8h
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    ret
+  %1 = fadd <8 x half> %a, %b
+  %2 = shufflevector <8 x half> %1, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %3 = shufflevector <8 x half> %1, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %4 = tail call <4 x half> @llvm.aarch64.neon.faddp.v4f16(<4 x half> %2, <4 x half> %3)
+  ret <4 x half> %4
+}
+
+declare <2 x float> @llvm.aarch64.neon.faddp.v2f32(<2 x float>, <2 x float>)
+declare <4 x half> @llvm.aarch64.neon.faddp.v4f16(<4 x half>, <4 x half>)
 
 attributes #0 = { strictfp }