diff --git a/llvm/lib/Target/ARM/ARM.td b/llvm/lib/Target/ARM/ARM.td
--- a/llvm/lib/Target/ARM/ARM.td
+++ b/llvm/lib/Target/ARM/ARM.td
@@ -303,6 +303,10 @@
 def FeatureHasSlowFPVMLx : SubtargetFeature<"slowfpvmlx", "SlowFPVMLx", "true",
                                             "Disable VFP / NEON MAC instructions">;
 
+// VFPv4 added VFMA instructions that can similarly be fast or slow.
+def FeatureHasSlowFPVFMx : SubtargetFeature<"slowfpvfmx", "SlowFPVFMx", "true",
+                                            "Disable VFP / NEON FMA instructions">;
+
 // Cortex-A8 / A9 Advanced SIMD has multiplier accumulator forwarding.
 def FeatureVMLxForwarding : SubtargetFeature<"vmlx-forwarding",
                                              "HasVMLxForwarding", "true",
@@ -588,6 +592,7 @@
                            FeatureHWDivThumb,
                            FeatureHWDivARM,
                            FeatureHasSlowFPVMLx,
+                           FeatureHasSlowFPVFMx,
                            FeatureHasRetAddrStack,
                            FeatureFuseLiterals,
                            FeatureFuseAES,
@@ -918,6 +923,7 @@
                            FeatureTrustZone,
                            FeatureSlowFPBrcc,
                            FeatureHasSlowFPVMLx,
+                           FeatureHasSlowFPVFMx,
                            FeatureVMLxForwarding,
                            FeatureMP,
                            FeatureVFP4]>;
@@ -928,6 +934,7 @@
                            FeatureSlowFPBrcc,
                            FeatureHasVMLxHazards,
                            FeatureHasSlowFPVMLx,
+                           FeatureHasSlowFPVFMx,
                            FeatureVMLxForwarding,
                            FeatureMP,
                            FeatureVFP4,
@@ -940,6 +947,7 @@
                            FeatureSlowFPBrcc,
                            FeatureHasVMLxHazards,
                            FeatureHasSlowFPVMLx,
+                           FeatureHasSlowFPVFMx,
                            FeatureVMLxForwarding]>;
 
 def : ProcessorModel<"cortex-a9", CortexA9Model, [ARMv7a, ProcA9,
@@ -1009,6 +1017,7 @@
                            FeatureAvoidPartialCPSR,
                            FeatureAvoidMOVsShOp,
                            FeatureHasSlowFPVMLx,
+                           FeatureHasSlowFPVFMx,
                            FeatureHasVMLxHazards,
                            FeatureProfUnpredicate,
                            FeaturePrefISHSTBarrier,
@@ -1027,6 +1036,7 @@
                            FeatureHasRetAddrStack,
                            FeatureSlowFPBrcc,
                            FeatureHasSlowFPVMLx,
+                           FeatureHasSlowFPVFMx,
                            FeatureVFP3_D16,
                            FeatureAvoidPartialCPSR]>;
 
@@ -1036,6 +1046,7 @@
                            FeatureSlowFPBrcc,
                            FeatureHWDivARM,
                            FeatureHasSlowFPVMLx,
+                           FeatureHasSlowFPVFMx,
                            FeatureAvoidPartialCPSR]>;
 
 def : ProcessorModel<"cortex-r7", CortexA8Model, [ARMv7r, ProcR7,
@@ -1046,6 +1057,7 @@
                            FeatureSlowFPBrcc,
                            FeatureHWDivARM,
                            FeatureHasSlowFPVMLx,
+                           FeatureHasSlowFPVFMx,
                            FeatureAvoidPartialCPSR]>;
 
 def : ProcessorModel<"cortex-r8", CortexA8Model, [ARMv7r,
@@ -1056,6 +1068,7 @@
                            FeatureSlowFPBrcc,
                            FeatureHWDivARM,
                            FeatureHasSlowFPVMLx,
+                           FeatureHasSlowFPVFMx,
                            FeatureAvoidPartialCPSR]>;
 
 def : ProcessorModel<"cortex-m3", CortexM4Model, [ARMv7m,
@@ -1073,6 +1086,7 @@
                            FeatureVFP4_D16_SP,
                            FeaturePrefLoopAlign32,
                            FeatureHasSlowFPVMLx,
+                           FeatureHasSlowFPVFMx,
                            FeatureUseMISched,
                            FeatureHasNoBranchPredictor]>;
 
@@ -1087,6 +1101,7 @@
                            FeatureFPARMv8_D16_SP,
                            FeaturePrefLoopAlign32,
                            FeatureHasSlowFPVMLx,
+                           FeatureHasSlowFPVFMx,
                            FeatureUseMISched,
                            FeatureHasNoBranchPredictor]>;
 
@@ -1095,6 +1110,7 @@
                            FeatureFPARMv8_D16_SP,
                            FeaturePrefLoopAlign32,
                            FeatureHasSlowFPVMLx,
+                           FeatureHasSlowFPVFMx,
                            FeatureUseMISched,
                            FeatureHasNoBranchPredictor]>;
 
@@ -1182,6 +1198,7 @@
                            FeatureAvoidPartialCPSR,
                            FeatureAvoidMOVsShOp,
                            FeatureHasSlowFPVMLx,
+                           FeatureHasSlowFPVFMx,
                            FeatureCrypto,
                            FeatureUseMISched,
                            FeatureZCZeroing,
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -15018,16 +15018,19 @@ /// patterns (and we don't have the non-fused floating point instruction).
 bool ARMTargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
                                                    EVT VT) const {
-  if (!Subtarget->hasMVEFloatOps())
-    return false;
-
   if (!VT.isSimple())
     return false;
 
   switch (VT.getSimpleVT().SimpleTy) {
   case MVT::v4f32:
   case MVT::v8f16:
-    return true;
+    return Subtarget->hasMVEFloatOps();
+  case MVT::f16:
+    return Subtarget->useFPVFMx16();
+  case MVT::f32:
+    return Subtarget->useFPVFMx();
+  case MVT::f64:
+    return Subtarget->useFPVFMx64();
   default:
     break;
   }
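(Reviewer note, not part of the patch: a minimal C++ sketch of what isFMAFasterThanFMulAndFAdd controls. When the hook returns true, @llvm.fmuladd lowers to one fused operation with a single rounding, like vfma.f64; otherwise it splits into an ordinary multiply and add with two roundings. All names below are illustrative stand-ins, not LLVM API.)

#include <cmath>
#include <cstdio>

// Stand-in for the per-type answer the hook now derives from the subtarget.
double lowerFMulAdd(double A, double B, double C, bool FMAIsFaster) {
  if (FMAIsFaster)
    return std::fma(A, B, C); // fused: rounds once, like vfma.f64
  return A * B + C;           // unfused: rounds twice, like vmul.f64 + vadd.f64
}

int main() {
  // The two lowerings can differ in the last bits precisely because the
  // fused form skips the intermediate rounding of A * B: here the fused
  // result is ~5.55e-17 while the unfused result is exactly 0.
  std::printf("fused:   %.17g\n", lowerFMulAdd(0.1, 10.0, -1.0, true));
  std::printf("unfused: %.17g\n", lowerFMulAdd(0.1, 10.0, -1.0, false));
  return 0;
}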
diff --git a/llvm/lib/Target/ARM/ARMPredicates.td b/llvm/lib/Target/ARM/ARMPredicates.td
--- a/llvm/lib/Target/ARM/ARMPredicates.td
+++ b/llvm/lib/Target/ARM/ARMPredicates.td
@@ -182,11 +182,9 @@
 // But only select them if more precision in FP computation is allowed, and when
 // they are not slower than a mul + add sequence.
 // Do not use them for Darwin platforms.
-def UseFusedMAC      : Predicate<"(TM.Options.AllowFPOpFusion =="
-                                 " FPOpFusion::Fast && "
-                                 " Subtarget->hasVFP4Base()) && "
-                                 "!Subtarget->isTargetDarwin() &&"
-                                 "Subtarget->useFPVMLx()">;
+def UseFusedMAC      : Predicate<"TM.Options.AllowFPOpFusion =="
+                                 " FPOpFusion::Fast && "
+                                 "Subtarget->useFPVFMx()">;
 
 def HasFastVGETLNi32 : Predicate<"!Subtarget->hasSlowVGETLNi32()">;
 def HasSlowVGETLNi32 : Predicate<"Subtarget->hasSlowVGETLNi32()">;
diff --git a/llvm/lib/Target/ARM/ARMSubtarget.h b/llvm/lib/Target/ARM/ARMSubtarget.h
--- a/llvm/lib/Target/ARM/ARMSubtarget.h
+++ b/llvm/lib/Target/ARM/ARMSubtarget.h
@@ -203,6 +203,10 @@
   /// whether the FP VML[AS] instructions are slow (if so, don't use them).
   bool SlowFPVMLx = false;
 
+  /// SlowFPVFMx - If the VFP4 / NEON instructions are available, indicates
+  /// whether the FP VFM[AS] instructions are slow (if so, don't use them).
+  bool SlowFPVFMx = false;
+
   /// HasVMLxForwarding - If true, NEON has special multiplier accumulator
   /// forwarding to allow mul + mla being issued back to back.
   bool HasVMLxForwarding = false;
@@ -632,6 +636,11 @@
   bool useMulOps() const { return UseMulOps; }
   bool useFPVMLx() const { return !SlowFPVMLx; }
+  bool useFPVFMx() const {
+    return !isTargetDarwin() && hasVFP4Base() && !SlowFPVFMx;
+  }
+  bool useFPVFMx16() const { return useFPVFMx() && hasFullFP16(); }
+  bool useFPVFMx64() const { return useFPVFMx() && hasFP64(); }
   bool hasVMLxForwarding() const { return HasVMLxForwarding; }
   bool isFPBrccSlow() const { return SlowFPBrcc; }
   bool hasFP64() const { return HasFP64; }
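(Reviewer note, not part of the patch: the three new accessors fold the conditions the old UseFusedMAC predicate string spelled out inline into one place, so the TableGen patterns and isFMAFasterThanFMulAndFAdd share a single definition. A standalone sketch of the composition; SubtargetSketch and its fields are illustrative stand-ins for the real ARMSubtarget API.)

#include <cassert>

struct SubtargetSketch {
  bool IsTargetDarwin = false; // isTargetDarwin()
  bool HasVFP4Base = false;    // hasVFP4Base(): VFPv4 provides VFMA/VFMS
  bool HasFullFP16 = false;    // hasFullFP16(): scalar fp16 arithmetic
  bool HasFP64 = false;        // hasFP64(): double-precision support
  bool SlowFPVFMx = false;     // the new +slowfpvfmx tuning bit

  // Fused MACs are considered only off Darwin, when VFPv4 exists, and when
  // the CPU has not marked them slow; the narrower variants additionally
  // require the matching register/precision support.
  bool useFPVFMx() const { return !IsTargetDarwin && HasVFP4Base && !SlowFPVFMx; }
  bool useFPVFMx16() const { return useFPVFMx() && HasFullFP16; }
  bool useFPVFMx64() const { return useFPVFMx() && HasFP64; }
};

int main() {
  SubtargetSketch ST;
  ST.HasVFP4Base = true;
  ST.HasFullFP16 = true;
  assert(ST.useFPVFMx() && ST.useFPVFMx16() && !ST.useFPVFMx64());

  ST.SlowFPVFMx = true; // +slowfpvfmx: every variant is now rejected
  assert(!ST.useFPVFMx() && !ST.useFPVFMx16() && !ST.useFPVFMx64());
  return 0;
}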
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -69,15 +69,15 @@
       ARM::FeatureDontWidenVMOVS, ARM::FeatureExpandMLx,
       ARM::FeatureHasVMLxHazards, ARM::FeatureNEONForFPMovs,
       ARM::FeatureNEONForFP, ARM::FeatureCheckVLDnAlign,
-      ARM::FeatureHasSlowFPVMLx, ARM::FeatureVMLxForwarding,
-      ARM::FeaturePref32BitThumb, ARM::FeatureAvoidPartialCPSR,
-      ARM::FeatureCheapPredicableCPSR, ARM::FeatureAvoidMOVsShOp,
-      ARM::FeatureHasRetAddrStack, ARM::FeatureHasNoBranchPredictor,
-      ARM::FeatureDSP, ARM::FeatureMP, ARM::FeatureVirtualization,
-      ARM::FeatureMClass, ARM::FeatureRClass, ARM::FeatureAClass,
-      ARM::FeatureNaClTrap, ARM::FeatureStrictAlign, ARM::FeatureLongCalls,
-      ARM::FeatureExecuteOnly, ARM::FeatureReserveR9, ARM::FeatureNoMovt,
-      ARM::FeatureNoNegativeImmediates
+      ARM::FeatureHasSlowFPVMLx, ARM::FeatureHasSlowFPVFMx,
+      ARM::FeatureVMLxForwarding, ARM::FeaturePref32BitThumb,
+      ARM::FeatureAvoidPartialCPSR, ARM::FeatureCheapPredicableCPSR,
+      ARM::FeatureAvoidMOVsShOp, ARM::FeatureHasRetAddrStack,
+      ARM::FeatureHasNoBranchPredictor, ARM::FeatureDSP, ARM::FeatureMP,
+      ARM::FeatureVirtualization, ARM::FeatureMClass, ARM::FeatureRClass,
+      ARM::FeatureAClass, ARM::FeatureNaClTrap, ARM::FeatureStrictAlign,
+      ARM::FeatureLongCalls, ARM::FeatureExecuteOnly, ARM::FeatureReserveR9,
+      ARM::FeatureNoMovt, ARM::FeatureNoNegativeImmediates
   };
 
   const ARMSubtarget *getST() const { return ST; }
diff --git a/llvm/test/CodeGen/ARM/cortex-a57-misched-vfma.ll b/llvm/test/CodeGen/ARM/cortex-a57-misched-vfma.ll
--- a/llvm/test/CodeGen/ARM/cortex-a57-misched-vfma.ll
+++ b/llvm/test/CodeGen/ARM/cortex-a57-misched-vfma.ll
@@ -93,12 +93,12 @@
 ; CHECK-SAME: Latency=0
 
 ; CHECK-DEFAULT: VMLSS
-; CHECK-FAST: VFMSS
-; > VMLSS common latency = 9
+; CHECK-FAST: VFNMSS
+; > VFNMSS common latency = 9
 ; CHECK: Latency : 9
 ; CHECK: Successors:
 ; CHECK: Data
-; > VMLSS read-advanced latency to the next VMLSS = 4
+; > VFNMSS read-advanced latency to the next VMLSS = 4
 ; CHECK-SAME: Latency=4
 
 ; CHECK-DEFAULT: VMLSS
diff --git a/llvm/test/CodeGen/ARM/fp16-fullfp16.ll b/llvm/test/CodeGen/ARM/fp16-fullfp16.ll
--- a/llvm/test/CodeGen/ARM/fp16-fullfp16.ll
+++ b/llvm/test/CodeGen/ARM/fp16-fullfp16.ll
@@ -571,7 +571,7 @@
 ; CHECK:      vldr.16 s0, [r1]
 ; CHECK-NEXT: vldr.16 s2, [r0]
 ; CHECK-NEXT: vldr.16 s4, [r2]
-; CHECK-NEXT: vmla.f16 s4, s2, s0
+; CHECK-NEXT: vfma.f16 s4, s2, s0
 ; CHECK-NEXT: vstr.16 s4, [r0]
 ; CHECK-NEXT: bx lr
 %a = load half, half* %p, align 2
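(Reviewer note, not part of the patch, on the ARMTargetTransformInfo.h hunk above: that array lists features treated as tuning-only when deciding whether functions built with different target attributes may be inlined into each other, which is why the new bit belongs there. A rough standalone model of the idea; names and types below are illustrative, not the LLVM API.)

#include <set>
#include <string>

// Tuning-only features (now including slowfpvfmx) are stripped from both
// sides before the check, so they never block inlining; the callee's
// remaining ISA features must all be present in the caller, so inlined
// code never uses instructions the caller's subtarget lacks.
bool areInlineCompatible(std::set<std::string> Caller,
                         std::set<std::string> Callee,
                         const std::set<std::string> &TuningOnly) {
  for (const std::string &F : TuningOnly) {
    Caller.erase(F);
    Callee.erase(F);
  }
  for (const std::string &F : Callee)
    if (!Caller.count(F))
      return false;
  return true;
}

int main() {
  const std::set<std::string> TuningOnly = {"slowfpvmlx", "slowfpvfmx"};
  // Differing only in a tuning bit: still inline-compatible.
  return areInlineCompatible({"vfp4", "slowfpvfmx"}, {"vfp4"}, TuningOnly) ? 0 : 1;
}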
diff --git a/llvm/test/CodeGen/ARM/fp16-fusedMAC.ll b/llvm/test/CodeGen/ARM/fp16-fusedMAC.ll
--- a/llvm/test/CodeGen/ARM/fp16-fusedMAC.ll
+++ b/llvm/test/CodeGen/ARM/fp16-fusedMAC.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=thumbv8.1-m-none-eabi -mattr=+fullfp16 -fp-contract=fast | FileCheck %s
-; RUN: llc < %s -mtriple=thumbv8.1-m-none-eabi -mattr=+fullfp16,+slowfpvmlx -fp-contract=fast | FileCheck %s -check-prefix=DONT-FUSE
+; RUN: llc < %s -mtriple=thumbv8.1-m-none-eabi -mattr=+fullfp16,+slowfpvfmx -fp-contract=fast | FileCheck %s -check-prefix=DONT-FUSE
 
 ; Check generated fp16 fused MAC and MLS.
 
diff --git a/llvm/test/CodeGen/Thumb2/float-intrinsics-double.ll b/llvm/test/CodeGen/Thumb2/float-intrinsics-double.ll
--- a/llvm/test/CodeGen/Thumb2/float-intrinsics-double.ll
+++ b/llvm/test/CodeGen/Thumb2/float-intrinsics-double.ll
@@ -201,7 +201,7 @@
 ; SOFT: bl __aeabi_dadd
 ; VFP4: vmul.f64
 ; VFP4: vadd.f64
-; FP-ARMv8: vmla.f64
+; FP-ARMv8: vfma.f64
 %1 = call double @llvm.fmuladd.f64(double %a, double %b, double %c)
 ret double %1
 }
diff --git a/llvm/test/CodeGen/Thumb2/float-intrinsics-float.ll b/llvm/test/CodeGen/Thumb2/float-intrinsics-float.ll
--- a/llvm/test/CodeGen/Thumb2/float-intrinsics-float.ll
+++ b/llvm/test/CodeGen/Thumb2/float-intrinsics-float.ll
@@ -194,7 +194,7 @@
 ; CHECK-LABEL: fmuladd_f:
 ; SOFT: bl __aeabi_fmul
 ; SOFT: bl __aeabi_fadd
-; VMLA: vmla.f32
+; VMLA: vfma.f32
 ; NO-VMLA: vmul.f32
 ; NO-VMLA: vadd.f32
 %1 = call float @llvm.fmuladd.f32(float %a, float %b, float %c)
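(Reviewer note, not part of the patch: the source-level pattern behind these fmuladd tests. With contraction enabled, Clang emits @llvm.fmuladd for the expression below, and this patch decides per-subtarget whether that becomes a vfma.f32 or a vmul.f32 + vadd.f32 pair. The compile command is a plausible example, not taken from the patch.)

// Compile with something like:
//   clang --target=arm-none-eabi -mfpu=vfpv4 -O2 -ffp-contract=on -S muladd.c
// On an FMA-friendly subtarget the body is a single vfma.f32; with the new
// +slowfpvfmx tuning bit it stays a vmul.f32 followed by a vadd.f32.
float muladd(float a, float b, float c) {
  return a * b + c; // contractible multiply-add
}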