Index: llvm/lib/Target/ARM/ARMISelLowering.h
===================================================================
--- llvm/lib/Target/ARM/ARMISelLowering.h
+++ llvm/lib/Target/ARM/ARMISelLowering.h
@@ -736,16 +736,7 @@
     SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
                           SmallVectorImpl<SDNode *> &Created) const override;
 
-    /// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster
-    /// than a pair of fmul and fadd instructions. fmuladd intrinsics will be
-    /// expanded to FMAs when this method returns true, otherwise fmuladd is
-    /// expanded to fmul + fadd.
-    ///
-    /// ARM supports both fused and unfused multiply-add operations; we already
-    /// lower a pair of fmul and fadd to the latter so it's not clear that there
-    /// would be a gain or that the gain would be worthwhile enough to risk
-    /// correctness bugs.
-    bool isFMAFasterThanFMulAndFAdd(EVT VT) const override { return false; }
+    bool isFMAFasterThanFMulAndFAdd(EVT VT) const override;
 
     SDValue ReconstructShuffle(SDValue Op, SelectionDAG &DAG) const;
 
Index: llvm/lib/Target/ARM/ARMISelLowering.cpp
===================================================================
--- llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -15005,6 +15005,36 @@
   return -1;
 }
 
+/// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster
+/// than a pair of fmul and fadd instructions. fmuladd intrinsics will be
+/// expanded to FMAs when this method returns true, otherwise fmuladd is
+/// expanded to fmul + fadd.
+///
+/// ARM supports both fused and unfused multiply-add operations; we already
+/// lower a pair of fmul and fadd to the latter so it's not clear that there
+/// would be a gain or that the gain would be worthwhile enough to risk
+/// correctness bugs.
+///
+/// For MVE, we set this to true as it simplifies the patterns we need (MVE
+/// has no unfused vector multiply-add instruction).
+bool ARMTargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
+  if (!Subtarget->hasMVEFloatOps())
+    return false;
+
+  if (!VT.isSimple())
+    return false;
+
+  switch (VT.getSimpleVT().SimpleTy) {
+  case MVT::v4f32:
+  case MVT::v8f16:
+    return true;
+  default:
+    break;
+  }
+
+  return false;
+}
+
 static bool isLegalT1AddressImmediate(int64_t V, EVT VT) {
   if (V < 0)
     return false;
Index: llvm/lib/Target/ARM/ARMInstrMVE.td
===================================================================
--- llvm/lib/Target/ARM/ARMInstrMVE.td
+++ llvm/lib/Target/ARM/ARMInstrMVE.td
@@ -2786,31 +2786,15 @@
 def MVE_VFMSf16 : MVE_VADDSUBFMA_fp<"vfms", "f16", 0b1, 0b1, 0b0, 0b1, v8f16,
                                     (ins MQPR:$Qd_src), vpred_n, "$Qd = $Qd_src">;
 
-let Predicates = [HasMVEFloat, UseFusedMAC] in {
-  def : Pat<(v8f16 (fadd (v8f16 MQPR:$src1),
-                         (fmul (v8f16 MQPR:$src2),
-                               (v8f16 MQPR:$src3)))),
-            (v8f16 (MVE_VFMAf16 $src1, $src2, $src3))>;
-  def : Pat<(v4f32 (fadd (v4f32 MQPR:$src1),
-                         (fmul (v4f32 MQPR:$src2),
-                               (v4f32 MQPR:$src3)))),
-            (v4f32 (MVE_VFMAf32 $src1, $src2, $src3))>;
-
-  def : Pat<(v8f16 (fsub (v8f16 MQPR:$src1),
-                         (fmul (v8f16 MQPR:$src2),
-                               (v8f16 MQPR:$src3)))),
-            (v8f16 (MVE_VFMSf16 $src1, $src2, $src3))>;
-  def : Pat<(v4f32 (fsub (v4f32 MQPR:$src1),
-                         (fmul (v4f32 MQPR:$src2),
-                               (v4f32 MQPR:$src3)))),
-            (v4f32 (MVE_VFMSf32 $src1, $src2, $src3))>;
-}
-
 let Predicates = [HasMVEFloat] in {
   def : Pat<(v8f16 (fma (v8f16 MQPR:$src1), (v8f16 MQPR:$src2), (v8f16 MQPR:$src3))),
             (v8f16 (MVE_VFMAf16 $src3, $src1, $src2))>;
   def : Pat<(v4f32 (fma (v4f32 MQPR:$src1), (v4f32 MQPR:$src2), (v4f32 MQPR:$src3))),
             (v4f32 (MVE_VFMAf32 $src3, $src1, $src2))>;
+  def : Pat<(v8f16 (fma (fneg (v8f16 MQPR:$src1)), (v8f16 MQPR:$src2), (v8f16 MQPR:$src3))),
+            (v8f16 (MVE_VFMSf16 $src3, $src1, $src2))>;
+  def : Pat<(v4f32 (fma (fneg (v4f32 MQPR:$src1)), (v4f32 MQPR:$src2), (v4f32 MQPR:$src3))),
+            (v4f32 (MVE_VFMSf32 $src3, $src1, $src2))>;
 }
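Reviewer note, not part of the patch: an end-to-end sketch of what this enables. Under the default fp-op fusion mode, @llvm.fmuladd on v4f32/v8f16 now expands to ISD::FMA (because isFMAFasterThanFMulAndFAdd returns true for those types), and an fma whose multiplicand is negated is selected by the new (fma (fneg ...)) patterns as VFMS. The IR below is written in the style of the Thumb2 MVE CodeGen tests; the function names, CHECK lines, and RUN invocation are illustrative, not taken from this patch's test files:

; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp %s -o - | FileCheck %s

; fmuladd(a, b, acc) now becomes ISD::FMA rather than fmul+fadd, and the
; plain fma pattern selects MVE_VFMAf32.
; CHECK-LABEL: fmuladd_v4f32:
; CHECK: vfma.f32
define arm_aapcs_vfpcc <4 x float> @fmuladd_v4f32(<4 x float> %acc, <4 x float> %a, <4 x float> %b) {
entry:
  %r = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %acc)
  ret <4 x float> %r
}

; (fma (fneg %a), %b, %acc) matches the new MVE_VFMSf32 pattern.
; CHECK-LABEL: fms_v4f32:
; CHECK: vfms.f32
define arm_aapcs_vfpcc <4 x float> @fms_v4f32(<4 x float> %acc, <4 x float> %a, <4 x float> %b) {
entry:
  %neg = fneg <4 x float> %a
  %r = call <4 x float> @llvm.fma.v4f32(<4 x float> %neg, <4 x float> %b, <4 x float> %acc)
  ret <4 x float> %r
}

declare <4 x float> @llvm.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>)
declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)

Expressing VFMS as (fma (fneg ...)) instead of the old fsub-of-fmul form keeps the pattern correct under fused semantics, which is what lets the UseFusedMAC fadd/fsub patterns above be deleted.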