diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -5345,19 +5345,22 @@
 defm FMINNMP : SIMDFPPairwiseScalar<1, 0b01100, "fminnmp">;
 defm FMINP   : SIMDFPPairwiseScalar<1, 0b01111, "fminp">;
 
+// Only the lower half of the result of the inner FADDP is used in the patterns
+// below, so the second operand does not matter. Re-use the first input
+// operand, so no additional dependencies need to be introduced.
 let Predicates = [HasFullFP16] in {
 def : Pat<(f16 (vecreduce_fadd (v8f16 V128:$Rn))),
             (FADDPv2i16p
               (EXTRACT_SUBREG
-                 (FADDPv8f16 (FADDPv8f16 V128:$Rn, (v8f16 (IMPLICIT_DEF))), (v8f16 (IMPLICIT_DEF))),
+                 (FADDPv8f16 (FADDPv8f16 V128:$Rn, V128:$Rn), V128:$Rn),
                dsub))>;
 def : Pat<(f16 (vecreduce_fadd (v4f16 V64:$Rn))),
-          (FADDPv2i16p (FADDPv4f16 V64:$Rn, (v4f16 (IMPLICIT_DEF))))>;
+          (FADDPv2i16p (FADDPv4f16 V64:$Rn, V64:$Rn))>;
 }
 
 def : Pat<(f32 (vecreduce_fadd (v4f32 V128:$Rn))),
           (FADDPv2i32p (EXTRACT_SUBREG
-              (FADDPv4f32 V128:$Rn, (v4f32 (IMPLICIT_DEF))),
+              (FADDPv4f32 V128:$Rn, V128:$Rn),
              dsub))>;
 def : Pat<(f32 (vecreduce_fadd (v2f32 V64:$Rn))),
           (FADDPv2i32p V64:$Rn)>;
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fadd.ll b/llvm/test/CodeGen/AArch64/vecreduce-fadd.ll
--- a/llvm/test/CodeGen/AArch64/vecreduce-fadd.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-fadd.ll
@@ -45,8 +45,8 @@
 define half @add_H(<8 x half> %bin.rdx) {
 ; FULLFP16-LABEL: add_H:
 ; FULLFP16:       // %bb.0:
-; FULLFP16-NEXT:    faddp v0.8h, v0.8h, v0.8h
-; FULLFP16-NEXT:    faddp v0.8h, v0.8h, v0.8h
+; FULLFP16-NEXT:    faddp v1.8h, v0.8h, v0.8h
+; FULLFP16-NEXT:    faddp v0.8h, v1.8h, v0.8h
 ; FULLFP16-NEXT:    faddp h0, v0.2h
 ; FULLFP16-NEXT:    ret
 ;
@@ -115,8 +115,8 @@
 ; FULLFP16-LABEL: add_2H:
 ; FULLFP16:       // %bb.0:
 ; FULLFP16-NEXT:    fadd v0.8h, v0.8h, v1.8h
-; FULLFP16-NEXT:    faddp v0.8h, v0.8h, v0.8h
-; FULLFP16-NEXT:    faddp v0.8h, v0.8h, v0.8h
+; FULLFP16-NEXT:    faddp v1.8h, v0.8h, v0.8h
+; FULLFP16-NEXT:    faddp v0.8h, v1.8h, v0.8h
 ; FULLFP16-NEXT:    faddp h0, v0.2h
 ; FULLFP16-NEXT:    ret
 ;
@@ -248,7 +248,7 @@
 ; CHECK-NEXT:    ldr q1, [x0, x8]
 ; CHECK-NEXT:    add x8, x8, #16
 ; CHECK-NEXT:    cmp w8, #112
-; CHECK-NEXT:    faddp v1.4s, v1.4s, v0.4s
+; CHECK-NEXT:    faddp v1.4s, v1.4s, v1.4s
 ; CHECK-NEXT:    faddp s1, v1.2s
 ; CHECK-NEXT:    fadd s0, s1, s0
 ; CHECK-NEXT:    b.ne .LBB9_1
@@ -286,7 +286,7 @@
 ; FULLFP16-NEXT:    ldr d1, [x0, x8]
 ; FULLFP16-NEXT:    add x8, x8, #8
 ; FULLFP16-NEXT:    cmp w8, #56
-; FULLFP16-NEXT:    faddp v1.4h, v1.4h, v0.4h
+; FULLFP16-NEXT:    faddp v1.4h, v1.4h, v1.4h
 ; FULLFP16-NEXT:    faddp h1, v1.2h
 ; FULLFP16-NEXT:    fadd h0, h1, h0
 ; FULLFP16-NEXT:    b.ne .LBB10_1
@@ -357,8 +357,8 @@
 ; FULLFP16-NEXT:    ldr q1, [x0, x8]
 ; FULLFP16-NEXT:    add x8, x8, #8
 ; FULLFP16-NEXT:    cmp w8, #56
-; FULLFP16-NEXT:    faddp v1.8h, v1.8h, v0.8h
-; FULLFP16-NEXT:    faddp v1.8h, v1.8h, v0.8h
+; FULLFP16-NEXT:    faddp v2.8h, v1.8h, v1.8h
+; FULLFP16-NEXT:    faddp v1.8h, v2.8h, v1.8h
 ; FULLFP16-NEXT:    faddp h1, v1.2h
 ; FULLFP16-NEXT:    fadd h0, h1, h0
 ; FULLFP16-NEXT:    b.ne .LBB11_1