Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1071,6 +1071,9 @@ // ADDP custom lowering for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) setOperationAction(ISD::ADD, VT, Custom); + // FADDP custom lowering + for (MVT VT : { MVT::v16f16, MVT::v8f32, MVT::v4f64 }) + setOperationAction(ISD::FADD, VT, Custom); } if (Subtarget->hasSVE()) { @@ -19300,9 +19303,13 @@ } static void ReplaceAddWithADDP(SDNode *N, SmallVectorImpl &Results, - SelectionDAG &DAG) { + SelectionDAG &DAG, + const AArch64Subtarget *Subtarget) { EVT VT = N->getValueType(0); - if (!VT.is256BitVector()) + if (!VT.is256BitVector() || + (VT.getScalarType().isFloatingPoint() && + !N->getFlags().hasAllowReassociation()) || + (VT.getScalarType() == MVT::f16 && !Subtarget->hasFullFP16())) return; SDValue X = N->getOperand(0); @@ -19520,7 +19527,8 @@ Results.push_back(LowerVECREDUCE(SDValue(N, 0), DAG)); return; case ISD::ADD: - ReplaceAddWithADDP(N, Results, DAG); + case ISD::FADD: + ReplaceAddWithADDP(N, Results, DAG, Subtarget); return; case ISD::CTPOP: Index: llvm/lib/Target/AArch64/AArch64InstrInfo.td =================================================================== --- llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -708,6 +708,9 @@ def AArch64saddlp : PatFrags<(ops node:$src), [(AArch64saddlp_n node:$src), (int_aarch64_neon_saddlp node:$src)]>; +def AArch64faddp : PatFrags<(ops node:$Rn, node:$Rm), + [(AArch64addp_n node:$Rn, node:$Rm), + (int_aarch64_neon_faddp node:$Rn, node:$Rm)]>; def SDT_AArch64SETTAG : SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisPtrTy<1>]>; def AArch64stg : SDNode<"AArch64ISD::STG", SDT_AArch64SETTAG, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; @@ -4533,7 +4536,7 @@ } defm FACGE : SIMDThreeSameVectorFPCmp<1,0,0b101,"facge",int_aarch64_neon_facge>; defm FACGT : SIMDThreeSameVectorFPCmp<1,1,0b101,"facgt",int_aarch64_neon_facgt>; -defm FADDP : SIMDThreeSameVectorFP<1,0,0b010,"faddp",int_aarch64_neon_faddp>; +defm FADDP : SIMDThreeSameVectorFP<1,0,0b010,"faddp", AArch64faddp>; defm FADD : SIMDThreeSameVectorFP<0,0,0b010,"fadd", any_fadd>; defm FCMEQ : SIMDThreeSameVectorFPCmp<0, 0, 0b100, "fcmeq", AArch64fcmeq>; defm FCMGE : SIMDThreeSameVectorFPCmp<1, 0, 0b100, "fcmge", AArch64fcmge>; Index: llvm/test/CodeGen/AArch64/faddp-half.ll =================================================================== --- llvm/test/CodeGen/AArch64/faddp-half.ll +++ llvm/test/CodeGen/AArch64/faddp-half.ll @@ -214,10 +214,9 @@ define <16 x half> @addp_v16f16(<16 x half> %a) { ; CHECK-LABEL: addp_v16f16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: rev32 v2.8h, v0.8h -; CHECK-NEXT: rev32 v3.8h, v1.8h -; CHECK-NEXT: fadd v0.8h, v2.8h, v0.8h -; CHECK-NEXT: fadd v1.8h, v3.8h, v1.8h +; CHECK-NEXT: faddp v1.8h, v0.8h, v1.8h +; CHECK-NEXT: zip1 v0.8h, v1.8h, v1.8h +; CHECK-NEXT: zip2 v1.8h, v1.8h, v1.8h ; CHECK-NEXT: ret ; ; CHECKNOFP16-LABEL: addp_v16f16: Index: llvm/test/CodeGen/AArch64/faddp.ll =================================================================== --- llvm/test/CodeGen/AArch64/faddp.ll +++ llvm/test/CodeGen/AArch64/faddp.ll @@ -191,10 +191,9 @@ define <4 x double> @addp_v4f64(<4 x double> %a) { ; CHECK-LABEL: addp_v4f64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ext v2.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: ext v3.16b, v1.16b, v1.16b, #8 -; CHECK-NEXT: fadd v0.2d, v2.2d, v0.2d -; CHECK-NEXT: fadd v1.2d, v3.2d, v1.2d +; CHECK-NEXT: faddp v1.2d, v0.2d, v1.2d +; CHECK-NEXT: dup v0.2d, v1.d[0] +; CHECK-NEXT: dup v1.2d, v1.d[1] ; CHECK-NEXT: ret entry: %s = shufflevector <4 x double> %a, <4 x double> poison, <4 x i32> @@ -231,10 +230,9 @@ define <8 x float> @addp_v8f32_slow(<8 x float> %a) { ; CHECK-LABEL: addp_v8f32_slow: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: rev64 v2.4s, v0.4s -; CHECK-NEXT: rev64 v3.4s, v1.4s -; CHECK-NEXT: fadd v0.4s, v2.4s, v0.4s -; CHECK-NEXT: fadd v1.4s, v3.4s, v1.4s +; CHECK-NEXT: faddp v1.4s, v0.4s, v1.4s +; CHECK-NEXT: zip1 v0.4s, v1.4s, v1.4s +; CHECK-NEXT: zip2 v1.4s, v1.4s, v1.4s ; CHECK-NEXT: ret entry: %s = shufflevector <8 x float> %a, <8 x float> poison, <8 x i32> @@ -245,14 +243,12 @@ define <16 x float> @addp_v16f32(<16 x float> %a) { ; CHECK-LABEL: addp_v16f32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: rev64 v4.4s, v0.4s -; CHECK-NEXT: rev64 v5.4s, v1.4s -; CHECK-NEXT: rev64 v6.4s, v2.4s -; CHECK-NEXT: rev64 v7.4s, v3.4s -; CHECK-NEXT: fadd v0.4s, v4.4s, v0.4s -; CHECK-NEXT: fadd v1.4s, v5.4s, v1.4s -; CHECK-NEXT: fadd v2.4s, v6.4s, v2.4s -; CHECK-NEXT: fadd v3.4s, v7.4s, v3.4s +; CHECK-NEXT: faddp v3.4s, v2.4s, v3.4s +; CHECK-NEXT: faddp v1.4s, v0.4s, v1.4s +; CHECK-NEXT: zip1 v2.4s, v3.4s, v3.4s +; CHECK-NEXT: zip1 v0.4s, v1.4s, v1.4s +; CHECK-NEXT: zip2 v1.4s, v1.4s, v1.4s +; CHECK-NEXT: zip2 v3.4s, v3.4s, v3.4s ; CHECK-NEXT: ret entry: %s = shufflevector <16 x float> %a, <16 x float> poison, <16 x i32>