# Patch summary (text outside the hunks is commentary only):
#   1. AArch64ISelLowering.h / .cpp: declare and define the override
#      AArch64TargetLowering::enableAggressiveFMAFusion(EVT). It returns true
#      only when the subtarget's processor family is ThunderX2T99 and the
#      queried value type is floating point; otherwise it returns false.
#   2. AArch64SchedThunderX2T99.td: raise LoopMicroOpBufferSize from 32 to 128,
#      and change one WriteRes entry from Unsupported = 1 to Unsupported = 0
#      with Latency = 4 (NumMicroOps = 2 is untouched context).
#
# NOTE(review): line breaks, indentation and blank context lines below were
# reconstructed from the hunk line counts -- confirm whitespace against the
# target tree before applying.
# NOTE(review): both "def : WriteRes {" lines appear to have lost their
# template arguments (the text between angle brackets) -- TODO confirm against
# the upstream file which scheduling write types they refer to.
# NOTE(review): "let Unsupported = 0;" presumably restates the field's default
# value -- verify, and consider dropping the line if so.

# --- Header: declare the hook next to the other TargetLowering overrides. ---
Index: lib/Target/AArch64/AArch64ISelLowering.h
===================================================================
--- lib/Target/AArch64/AArch64ISelLowering.h
+++ lib/Target/AArch64/AArch64ISelLowering.h
@@ -455,6 +455,9 @@
     return true;
   }

+  /// Enable aggressive FMA fusion on targets that want it.
+  bool enableAggressiveFMAFusion(EVT VT) const override;
+
   /// Returns the size of the platform's va_list object.
   unsigned getVaListSizeInBits(const DataLayout &DL) const override;

# --- Implementation: opt in for ThunderX2T99 only, floating-point types only. ---
Index: lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- lib/Target/AArch64/AArch64ISelLowering.cpp
+++ lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -10978,6 +10978,11 @@
   return OptSize && !VT.isVector();
 }

+bool AArch64TargetLowering::enableAggressiveFMAFusion(EVT VT) const {
+  return Subtarget->getProcFamily() == AArch64Subtarget::ThunderX2T99 &&
+         VT.isFloatingPoint();
+}
+
 unsigned
 AArch64TargetLowering::getVaListSizeInBits(const DataLayout &DL) const {
   if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
# --- Scheduling model: larger loop micro-op buffer; one WriteRes entry is ---
# --- no longer marked unsupported and is given a latency of 4.           ---
Index: lib/Target/AArch64/AArch64SchedThunderX2T99.td
===================================================================
--- lib/Target/AArch64/AArch64SchedThunderX2T99.td
+++ lib/Target/AArch64/AArch64SchedThunderX2T99.td
@@ -22,7 +22,7 @@
   let LoadLatency = 4; // Optimistic load latency.
   let MispredictPenalty = 12; // Extra cycles for mispredicted branch.
   // Determined via a mix of micro-arch details and experimentation.
-  let LoopMicroOpBufferSize = 32;
+  let LoopMicroOpBufferSize = 128;
   let PostRAScheduler = 1; // Using PostRA sched.
   let CompleteModel = 1;

@@ -391,7 +391,8 @@
 def : WriteRes { let Latency = 1; }

 def : WriteRes {
-  let Unsupported = 1;
+  let Unsupported = 0;
+  let Latency = 4;
   let NumMicroOps = 2;
 }
