Index: lib/Target/AArch64/AArch64.td
===================================================================
--- lib/Target/AArch64/AArch64.td
+++ lib/Target/AArch64/AArch64.td
@@ -94,6 +94,9 @@
 def FeatureSlowPaired128 : SubtargetFeature<"slow-paired-128", "Paired128IsSlow",
                                             "true", "Paired 128 bit loads and stores are slow">;
 
+def FeatureSlowSTRQro : SubtargetFeature<"slow-strqro-store", "STRQroIsSlow",
+                                         "true", "STR of Q register with register offset is slow">;
+
 def FeatureAlternateSExtLoadCVTF32Pattern : SubtargetFeature<
     "alternate-sextload-cvt-f32-pattern", "UseAlternateSExtLoadCVTF32Pattern",
     "true", "Use alternative pattern for sextload convert to f32">;
@@ -315,7 +318,8 @@
                                    FeaturePredictableSelectIsExpensive,
                                    FeatureRDM,
                                    FeatureZCZeroing,
-                                   FeatureLSLFast
+                                   FeatureLSLFast,
+                                   FeatureSlowSTRQro
                                    ]>;
 
 def ProcThunderX2T99 : SubtargetFeature<"thunderx2t99", "ARMProcFamily",
Index: lib/Target/AArch64/AArch64InstrFormats.td
===================================================================
--- lib/Target/AArch64/AArch64InstrFormats.td
+++ lib/Target/AArch64/AArch64InstrFormats.td
@@ -3072,22 +3072,18 @@
 
 multiclass Store128RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
                       string asm, ValueType Ty, SDPatternOperator storeop> {
-  let AddedComplexity = 10, mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
+  let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
   def roW : LoadStore128RO<sz, V, opc, regtype, asm, (outs),
                 (ins regtype:$Rt, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend128:$extend),
-                [(storeop (Ty regtype:$Rt),
-                          (ro_Windexed128 GPR64sp:$Rn, GPR32:$Rm,
-                                          ro_Wextend128:$extend))]>,
+                []>,
             Sched<[WriteSTIdx, ReadAdrBase]> {
     let Inst{13} = 0b0;
   }
 
-  let AddedComplexity = 10, mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
+  let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
   def roX : LoadStore128RO<sz, V, opc, regtype, asm, (outs),
                 (ins regtype:$Rt, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend128:$extend),
-                [(storeop (Ty regtype:$Rt),
-                          (ro_Xindexed128 GPR64sp:$Rn, GPR64:$Rm,
-                                          ro_Xextend128:$extend))]>,
+                []>,
             Sched<[WriteSTIdx, ReadAdrBase]> {
     let Inst{13} = 0b1;
   }
Index: lib/Target/AArch64/AArch64InstrInfo.td
===================================================================
--- lib/Target/AArch64/AArch64InstrInfo.td
+++ lib/Target/AArch64/AArch64InstrInfo.td
@@ -330,6 +330,8 @@
 let RecomputePerFunction = 1 in {
   def ForCodeSize : Predicate<"MF->getFunction()->optForSize()">;
   def NotForCodeSize : Predicate<"!MF->getFunction()->optForSize()">;
+  // Avoid generating STRQro if it is slow, unless we're optimizing for code size.
+  def UseSTRQro : Predicate<"!Subtarget->isSTRQroSlow() || MF->getFunction()->optForSize()">;
 }
 
 include "AArch64InstrFormats.td"
@@ -2132,6 +2134,17 @@
 defm STRD : Store64RO<0b11, 1, 0b00, FPR64, "str", f64, store>;
 defm STRQ : Store128RO<0b00, 1, 0b10, FPR128, "str", f128, store>;
 
+let Predicates = [UseSTRQro], AddedComplexity = 10 in {
+  def : Pat<(store (f128 FPR128:$Rt),
+                   (ro_Windexed128 GPR64sp:$Rn, GPR32:$Rm,
+                                   ro_Wextend128:$extend)),
+            (STRQroW FPR128:$Rt, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend128:$extend)>;
+  def : Pat<(store (f128 FPR128:$Rt),
+                   (ro_Xindexed128 GPR64sp:$Rn, GPR64:$Rm,
+                                   ro_Xextend128:$extend)),
+            (STRQroX FPR128:$Rt, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend128:$extend)>;
+}
+
 multiclass TruncStoreFrom64ROPat<ROAddrMode ro, SDPatternOperator storeop,
                                  Instruction STRW, Instruction STRX,
                                  ValueType VecTy, ValueType ScalTy> {
@@ -2179,7 +2192,7 @@
 defm : VecROStorePat<ro64, v1f64, FPR64, STRDroW, STRDroX>;
 
 // Match all store 128 bits width whose type is compatible with FPR128
-let Predicates = [IsLE] in {
+let Predicates = [IsLE, UseSTRQro] in {
   // We must use ST1 to store vectors in big-endian.
   defm : VecROStorePat<ro128, v2i64, FPR128, STRQroW, STRQroX>;
   defm : VecROStorePat<ro128, v2f64, FPR128, STRQroW, STRQroX>;
Index: lib/Target/AArch64/AArch64Subtarget.h
===================================================================
--- lib/Target/AArch64/AArch64Subtarget.h
+++ lib/Target/AArch64/AArch64Subtarget.h
@@ -100,6 +100,7 @@
   bool UsePostRAScheduler = false;
   bool Misaligned128StoreIsSlow = false;
   bool Paired128IsSlow = false;
+  bool STRQroIsSlow = false;
   bool UseAlternateSExtLoadCVTF32Pattern = false;
   bool HasArithmeticBccFusion = false;
   bool HasArithmeticCbzFusion = false;
@@ -217,6 +218,7 @@
   bool hasCustomCheapAsMoveHandling() const { return CustomAsCheapAsMove; }
   bool isMisaligned128StoreSlow() const { return Misaligned128StoreIsSlow; }
   bool isPaired128Slow() const { return Paired128IsSlow; }
+  bool isSTRQroSlow() const { return STRQroIsSlow; }
   bool useAlternateSExtLoadCVTF32Pattern() const {
     return UseAlternateSExtLoadCVTF32Pattern;
   }
Index: test/CodeGen/AArch64/strqro.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AArch64/strqro.ll
@@ -0,0 +1,47 @@
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck --check-prefix=CHECK --check-prefix=CHECK-STRQRO %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mcpu=falkor | FileCheck --check-prefix=CHECK --check-prefix=CHECK-NOSTRQRO %s
+
+; CHECK-LABEL: strqrox:
+; CHECK-STRQRO: str q{{[0-9]+}}, [x{{[0-9]+}}, x
+; CHECK-NOSTRQRO-NOT: str q{{[0-9]+}}, [x{{[0-9]+}}, x
+define void @strqrox(fp128 %val64, i64 %base, i64 %offset) {
+  %addrint = add i64 %base, %offset
+  %addr = inttoptr i64 %addrint to fp128*
+  store volatile fp128 %val64, fp128* %addr
+  ret void
+}
+
+; Check that STRQro is generated for both cases if we're optimizing for code size.
+; CHECK-LABEL: strqrox_optsize:
+; CHECK-STRQRO: str q{{[0-9]+}}, [x{{[0-9]+}}, x
+; CHECK-NOSTRQRO: str q{{[0-9]+}}, [x{{[0-9]+}}, x
+define void @strqrox_optsize(fp128 %val64, i64 %base, i64 %offset) minsize {
+  %addrint = add i64 %base, %offset
+  %addr = inttoptr i64 %addrint to fp128*
+  store volatile fp128 %val64, fp128* %addr
+  ret void
+}
+
+; CHECK-LABEL: strqrow:
+; CHECK-STRQRO: str q{{[0-9]+}}, [x{{[0-9]+}}, w
+; CHECK-NOSTRQRO-NOT: str q{{[0-9]+}}, [x{{[0-9]+}}, w
+define void @strqrow(fp128 %val64, i64 %base, i32 %offset) {
+  %offset64 = zext i32 %offset to i64
+  %addrint = add i64 %base, %offset64
+  %addr = inttoptr i64 %addrint to fp128*
+  store volatile fp128 %val64, fp128* %addr
+  ret void
+}
+
+; Check that STRQro is generated for both cases if we're optimizing for code size.
+; CHECK-LABEL: strqrow_optsize:
+; CHECK-STRQRO: str q{{[0-9]+}}, [x{{[0-9]+}}, w
+; CHECK-NOSTRQRO: str q{{[0-9]+}}, [x{{[0-9]+}}, w
+define void @strqrow_optsize(fp128 %val64, i64 %base, i32 %offset) minsize {
+  %offset64 = zext i32 %offset to i64
+  %addrint = add i64 %base, %offset64
+  %addr = inttoptr i64 %addrint to fp128*
+  store volatile fp128 %val64, fp128* %addr
+  ret void
+}
+
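
Note: since the predicate keys off the subtarget feature rather than the CPU, the new behavior can also be exercised directly with -mattr, independent of -mcpu=falkor. A minimal sketch of such a test follows; the function name and RUN/CHECK lines are illustrative and not part of the patch:

; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=+slow-strqro-store | FileCheck %s

; With slow-strqro-store enabled and no minsize attribute, the
; register-offset store of a Q register should be avoided, so no
; "str qN, [xN, x..." form should appear.
; CHECK-LABEL: strqro_attr:
; CHECK-NOT: str q{{[0-9]+}}, [x{{[0-9]+}}, x
define void @strqro_attr(fp128 %val, i64 %base, i64 %offset) {
  %addrint = add i64 %base, %offset
  %addr = inttoptr i64 %addrint to fp128*
  store volatile fp128 %val, fp128* %addr
  ret void
}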