diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -738,16 +738,7 @@
     SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
                           SmallVectorImpl<SDNode *> &Created) const override;
 
-    /// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster
-    /// than a pair of fmul and fadd instructions. fmuladd intrinsics will be
-    /// expanded to FMAs when this method returns true, otherwise fmuladd is
-    /// expanded to fmul + fadd.
-    ///
-    /// ARM supports both fused and unfused multiply-add operations; we already
-    /// lower a pair of fmul and fadd to the latter so it's not clear that there
-    /// would be a gain or that the gain would be worthwhile enough to risk
-    /// correctness bugs.
-    bool isFMAFasterThanFMulAndFAdd(EVT VT) const override { return false; }
+    bool isFMAFasterThanFMulAndFAdd(EVT VT) const override;
 
     SDValue ReconstructShuffle(SDValue Op, SelectionDAG &DAG) const;
 
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -14858,6 +14858,36 @@
   return -1;
 }
 
+/// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster
+/// than a pair of fmul and fadd instructions. fmuladd intrinsics will be
+/// expanded to FMAs when this method returns true, otherwise fmuladd is
+/// expanded to fmul + fadd.
+///
+/// ARM supports both fused and unfused multiply-add operations; we already
+/// lower a pair of fmul and fadd to the latter so it's not clear that there
+/// would be a gain or that the gain would be worthwhile enough to risk
+/// correctness bugs.
+///
+/// For MVE, we set this to true as it helps simplify the need for some
+/// patterns (and we don't have the non-fused floating point instruction).
+bool ARMTargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
+  if (!Subtarget->hasMVEFloatOps())
+    return false;
+
+  if (!VT.isSimple())
+    return false;
+
+  switch (VT.getSimpleVT().SimpleTy) {
+  case MVT::v4f32:
+  case MVT::v8f16:
+    return true;
+  default:
+    break;
+  }
+
+  return false;
+}
+
 static bool isLegalT1AddressImmediate(int64_t V, EVT VT) {
   if (V < 0)
     return false;
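As the comment above states, returning true from this hook makes SelectionDAG expand the llvm.fmuladd intrinsic to a single fma node rather than an fmul + fadd pair. A minimal IR sketch of the v4f32 case this now affects (the function name @fmuladd_v4f32 is illustrative, not taken from this patch):

    declare <4 x float> @llvm.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>)

    define <4 x float> @fmuladd_v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %acc) {
      ; With the hook returning true for v4f32 under MVE, this call becomes a
      ; single ISD::FMA node, which the patterns below select to vfma.f32.
      %r = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %acc)
      ret <4 x float> %r
    }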
diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td
--- a/llvm/lib/Target/ARM/ARMInstrMVE.td
+++ b/llvm/lib/Target/ARM/ARMInstrMVE.td
@@ -2808,31 +2808,15 @@
 def MVE_VFMSf16 : MVE_VADDSUBFMA_fp<"vfms", "f16", 0b1, 0b1, 0b0, 0b1,
                                     (ins MQPR:$Qd_src), vpred_n, "$Qd = $Qd_src">;
 
-let Predicates = [HasMVEFloat, UseFusedMAC] in {
-  def : Pat<(v8f16 (fadd (v8f16 MQPR:$src1),
-                         (fmul (v8f16 MQPR:$src2),
-                               (v8f16 MQPR:$src3)))),
-            (v8f16 (MVE_VFMAf16 $src1, $src2, $src3))>;
-  def : Pat<(v4f32 (fadd (v4f32 MQPR:$src1),
-                         (fmul (v4f32 MQPR:$src2),
-                               (v4f32 MQPR:$src3)))),
-            (v4f32 (MVE_VFMAf32 $src1, $src2, $src3))>;
-
-  def : Pat<(v8f16 (fsub (v8f16 MQPR:$src1),
-                         (fmul (v8f16 MQPR:$src2),
-                               (v8f16 MQPR:$src3)))),
-            (v8f16 (MVE_VFMSf16 $src1, $src2, $src3))>;
-  def : Pat<(v4f32 (fsub (v4f32 MQPR:$src1),
-                         (fmul (v4f32 MQPR:$src2),
-                               (v4f32 MQPR:$src3)))),
-            (v4f32 (MVE_VFMSf32 $src1, $src2, $src3))>;
-}
-
 let Predicates = [HasMVEFloat] in {
   def : Pat<(v8f16 (fma (v8f16 MQPR:$src1), (v8f16 MQPR:$src2), (v8f16 MQPR:$src3))),
             (v8f16 (MVE_VFMAf16 $src3, $src1, $src2))>;
   def : Pat<(v4f32 (fma (v4f32 MQPR:$src1), (v4f32 MQPR:$src2), (v4f32 MQPR:$src3))),
             (v4f32 (MVE_VFMAf32 $src3, $src1, $src2))>;
+  def : Pat<(v8f16 (fma (fneg (v8f16 MQPR:$src1)), (v8f16 MQPR:$src2), (v8f16 MQPR:$src3))),
+            (v8f16 (MVE_VFMSf16 $src3, $src1, $src2))>;
+  def : Pat<(v4f32 (fma (fneg (v4f32 MQPR:$src1)), (v4f32 MQPR:$src2), (v4f32 MQPR:$src3))),
+            (v4f32 (MVE_VFMSf32 $src3, $src1, $src2))>;
 }
 
 multiclass MVE_VADDSUB_fp_m
 
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    vctp.32 r2
 ; CHECK-NEXT:    vpstt
-; CHECK-NEXT:    vldrwt.u32 q1, [r0]
-; CHECK-NEXT:    vldrwt.u32 q2, [r1]
+; CHECK-NEXT:    vldrwt.u32 q2, [r0]
+; CHECK-NEXT:    vldrwt.u32 q3, [r1]
 ; CHECK-NEXT:    mov r3, r2
-; CHECK-NEXT:    vmul.f32 q1, q2, q1
 ; CHECK-NEXT:    adds r0, #16
 ; CHECK-NEXT:    adds r1, #16
 ; CHECK-NEXT:    subs r2, #4
-; CHECK-NEXT:    vadd.f32 q1, q1, q0
+; CHECK-NEXT:    vmov q1, q0
+; CHECK-NEXT:    vfma.f32 q0, q3, q2
 ; CHECK-NEXT:    le lr, .LBB1_2
 ; CHECK-NEXT:  @ %bb.3: @ %middle.block
 ; CHECK-NEXT:    vctp.32 r3
-; CHECK-NEXT:    vpsel q0, q1, q0
+; CHECK-NEXT:    vpsel q0, q0, q1
 ; CHECK-NEXT:    vmov.f32 s4, s2
 ; CHECK-NEXT:    vmov.f32 s5, s3
 ; CHECK-NEXT:    vadd.f32 q0, q0, q1
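The added VFMS patterns cover the form a fused multiply-subtract takes once everything is expressed as fma: an fma with a negated multiplicand, since fma(-a, b, acc) == acc - a*b, which is exactly the vfms semantics of the removed UseFusedMAC fsub pattern. A sketch of IR that the new Pats select to vfms.f32 (function name illustrative, not from this patch):

    declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)

    define <4 x float> @fms_v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %acc) {
      ; (fma (fneg %a), %b, %acc) computes %acc - %a*%b; the new
      ; (fma (fneg ...)) Pat matches this to MVE_VFMSf32.
      %n = fneg <4 x float> %a
      %r = call <4 x float> @llvm.fma.v4f32(<4 x float> %n, <4 x float> %b, <4 x float> %acc)
      ret <4 x float> %r
    }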