Index: lib/Target/AArch64/AArch64.td
===================================================================
--- lib/Target/AArch64/AArch64.td
+++ lib/Target/AArch64/AArch64.td
@@ -94,6 +94,9 @@
 def FeatureSlowPaired128 : SubtargetFeature<"slow-paired-128", "Paired128IsSlow",
     "true", "Paired 128 bit loads and stores are slow">;
 
+def FeatureSlowSTRQro : SubtargetFeature<"slow-strqro-store", "STRQroIsSlow",
+    "true", "STR of Q register with register offset is slow">;
+
 def FeatureAlternateSExtLoadCVTF32Pattern : SubtargetFeature<
     "alternate-sextload-cvt-f32-pattern", "UseAlternateSExtLoadCVTF32Pattern",
     "true", "Use alternative pattern for sextload convert to f32">;
@@ -315,7 +318,8 @@
                                    FeaturePredictableSelectIsExpensive,
                                    FeatureRDM,
                                    FeatureZCZeroing,
-                                   FeatureLSLFast
+                                   FeatureLSLFast,
+                                   FeatureSlowSTRQro
                                    ]>;
 
 def ProcThunderX2T99 : SubtargetFeature<"thunderx2t99", "ARMProcFamily",
Index: lib/Target/AArch64/AArch64InstrFormats.td
===================================================================
--- lib/Target/AArch64/AArch64InstrFormats.td
+++ lib/Target/AArch64/AArch64InstrFormats.td
@@ -3072,22 +3072,18 @@
 multiclass Store128RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
                       string asm, ValueType Ty, SDPatternOperator storeop> {
-  let AddedComplexity = 10, mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
+  let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
   def roW : LoadStore128RO<sz, V, opc, regtype, asm, (outs),
                  (ins regtype:$Rt, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend128:$extend),
-                 [(storeop (Ty regtype:$Rt),
-                           (ro_Windexed128 GPR64sp:$Rn, GPR32:$Rm,
-                                           ro_Wextend128:$extend))]>,
+                 []>,
             Sched<[WriteSTIdx, ReadAdrBase]> {
     let Inst{13} = 0b0;
   }
 
-  let AddedComplexity = 10, mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
+  let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
   def roX : LoadStore128RO<sz, V, opc, regtype, asm, (outs),
                  (ins regtype:$Rt, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend128:$extend),
-                 [(storeop (Ty regtype:$Rt),
-                           (ro_Xindexed128 GPR64sp:$Rn, GPR64:$Rm,
-                                           ro_Xextend128:$extend))]>,
+                 []>,
             Sched<[WriteSTIdx, ReadAdrBase]> {
     let Inst{13} = 0b1;
   }
Index: lib/Target/AArch64/AArch64InstrInfo.td
===================================================================
--- lib/Target/AArch64/AArch64InstrInfo.td
+++ lib/Target/AArch64/AArch64InstrInfo.td
@@ -330,6 +330,7 @@
 let RecomputePerFunction = 1 in {
   def ForCodeSize : Predicate<"MF->getFunction()->optForSize()">;
   def NotForCodeSize : Predicate<"!MF->getFunction()->optForSize()">;
+  def UseSTRQro : Predicate<"!Subtarget->isSTRQroSlow() || MF->getFunction()->optForSize()">;
 }
 
 include "AArch64InstrFormats.td"
@@ -2132,6 +2133,18 @@
 defm STRD : Store64RO<0b11, 1, 0b00, FPR64, "str", f64, store>;
 defm STRQ : Store128RO<0b00, 1, 0b10, FPR128, "str", f128, store>;
 
+// Avoid generating STRQro if it is slow, unless we're optimizing for code size.
+let Predicates = [UseSTRQro], AddedComplexity = 10 in {
+  def : Pat<(store (f128 FPR128:$Rt),
+                   (ro_Windexed128 GPR64sp:$Rn, GPR32:$Rm,
+                                   ro_Wextend128:$extend)),
+            (STRQroW FPR128:$Rt, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend128:$extend)>;
+  def : Pat<(store (f128 FPR128:$Rt),
+                   (ro_Xindexed128 GPR64sp:$Rn, GPR64:$Rm,
+                                   ro_Xextend128:$extend)),
+            (STRQroX FPR128:$Rt, GPR64sp:$Rn, GPR64:$Rm, ro_Wextend128:$extend)>;
+}
+
 multiclass TruncStoreFrom64ROPat<ROAddrMode ro, SDPatternOperator storeop,
                                  Instruction STRW, Instruction STRX> {
@@ -2179,7 +2192,7 @@
 defm : VecROStorePat<ro64, v4f16, FPR64, STRDroW, STRDroX>;
 
 // Match all store 128 bits width whose type is compatible with FPR128
-let Predicates = [IsLE] in {
+let Predicates = [IsLE, UseSTRQro] in {
   // We must use ST1 to store vectors in big-endian.
   defm : VecROStorePat<ro128, v2i64, FPR128, STRQroW, STRQroX>;
   defm : VecROStorePat<ro128, v2f64, FPR128, STRQroW, STRQroX>;
Index: lib/Target/AArch64/AArch64Subtarget.h
===================================================================
--- lib/Target/AArch64/AArch64Subtarget.h
+++ lib/Target/AArch64/AArch64Subtarget.h
@@ -100,6 +100,7 @@
   bool UsePostRAScheduler = false;
   bool Misaligned128StoreIsSlow = false;
   bool Paired128IsSlow = false;
+  bool STRQroIsSlow = false;
   bool UseAlternateSExtLoadCVTF32Pattern = false;
   bool HasArithmeticBccFusion = false;
   bool HasArithmeticCbzFusion = false;
@@ -217,6 +218,7 @@
   bool hasCustomCheapAsMoveHandling() const { return CustomAsCheapAsMove; }
   bool isMisaligned128StoreSlow() const { return Misaligned128StoreIsSlow; }
   bool isPaired128Slow() const { return Paired128IsSlow; }
+  bool isSTRQroSlow() const { return STRQroIsSlow; }
   bool useAlternateSExtLoadCVTF32Pattern() const {
     return UseAlternateSExtLoadCVTF32Pattern;
   }