Index: lib/Target/ARM/ARM.td
===================================================================
--- lib/Target/ARM/ARM.td
+++ lib/Target/ARM/ARM.td
@@ -132,6 +132,27 @@
 def FeaturePrefISHSTBarrier : SubtargetFeature<"prefer-ishst", "PreferISHST",
                                                "true", "Prefer ISHST barriers">;
 
+// Some targets (e.g. Cortex-A9) have muxed AGU and NEON/FPU.
+def FeatureMuxedUnits : SubtargetFeature<"muxed-units", "HasMuxedUnits", "true",
+                                         "Has muxed AGU and NEON/FPU">;
+
+// On some targets, a VLDM/VSTM starting with an odd register number needs more
+// microops than single VLDRS/VSTRS.
+def FeatureSlowOddRegister : SubtargetFeature<"slow-odd-reg", "SlowOddRegister",
+                                              "true", "VLDM/VSTM starting with an odd register is slow">;
+
+// Some targets have a renaming dependency when loading into D subregisters.
+def FeatureSlowLoadDSubreg : SubtargetFeature<"slow-load-D-subreg",
+                                              "SlowLoadDSubregister", "true",
+                                              "Loading into D subregs is slow">;
+
+// Some targets use a compact unwind format which assumes that VFPs are
+// allocated with stride 4. This can expand the prologue, so the feature is
+// ignored when optimizing for minimum size.
+def FeatureUseStride4VFPs : SubtargetFeature<"use-stride4-vfps",
+                                             "UseStride4VFPs", "true",
+                                             "Allocate VFPs with stride 4">;
+
 // Some targets (e.g. Cortex-A9) want to convert VMOVRS, VMOVSR and VMOVS from
 // VFP to NEON, as an execution domain optimization.
 def FeatureNEONForFPMovs : SubtargetFeature<"neon-fpmovs", "UseNEONForFPMovs",
@@ -578,6 +599,7 @@
                                                   FeatureFP16,
                                                   FeatureAvoidPartialCPSR,
                                                   FeaturePreferVMOVSR,
+                                                  FeatureMuxedUnits,
                                                   FeatureNEONForFPMovs,
                                                   FeatureCheckVLDnAlign,
                                                   FeatureMP]>;
@@ -598,6 +620,7 @@
 // FIXME: A15 has currently the same Schedule model as A9.
 def : ProcessorModel<"cortex-a15", CortexA9Model, [ARMv7a, ProcA15,
                                                    FeatureHasRetAddrStack,
+                                                   FeatureMuxedUnits,
                                                    FeatureTrustZone,
                                                    FeatureT2XtPk,
                                                    FeatureVFP4,
@@ -626,6 +649,7 @@
 // division features.
 def : ProcessorModel<"krait", CortexA9Model, [ARMv7a, ProcKrait,
                                               FeatureHasRetAddrStack,
+                                              FeatureMuxedUnits,
                                               FeatureCheckVLDnAlign,
                                               FeatureVMLxForwarding,
                                               FeatureT2XtPk,
@@ -639,6 +663,7 @@
                                            FeatureHasRetAddrStack,
                                            FeatureNEONForFP,
                                            FeatureT2XtPk,
+                                           FeatureUseStride4VFPs,
                                            FeatureVFP4,
                                            FeatureMP,
                                            FeatureHWDiv,
@@ -648,6 +673,8 @@
                                            FeatureHasSlowFPVMLx,
                                            FeatureProfUnpredicate,
                                            FeaturePrefISHSTBarrier,
+                                           FeatureSlowOddRegister,
+                                           FeatureSlowLoadDSubreg,
                                            FeatureSlowVGETLNi32,
                                            FeatureSlowVDUP32]>;
 
Index: lib/Target/ARM/ARMHazardRecognizer.cpp
===================================================================
--- lib/Target/ARM/ARMHazardRecognizer.cpp
+++ lib/Target/ARM/ARMHazardRecognizer.cpp
@@ -50,8 +50,7 @@
 
       // Skip over one non-VFP / NEON instruction.
       if (!LastMI->isBarrier() &&
-          // On A9, AGU and NEON/FPU are muxed.
-          !(TII.getSubtarget().isLikeA9() && LastMI->mayLoadOrStore()) &&
+          !(TII.getSubtarget().hasMuxedUnits() && LastMI->mayLoadOrStore()) &&
          (LastMCID.TSFlags & ARMII::DomainMask) == ARMII::DomainGeneral) {
         MachineBasicBlock::iterator I = LastMI;
         if (I != LastMI->getParent()->begin()) {
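
Note (illustrative sketch, not part of the patch): the hazard-recognizer change swaps a CPU-family check for a feature query. The feature and accessor names come from this patch; STI and LastMI stand in for the subtarget reference and the previously issued MachineInstr that the surrounding function already has in scope.

  // On a CPU with +muxed-units, a load or store occupies the shared AGU/NEON
  // pipeline, so it must not be "skipped over" when looking for a
  // back-to-back VFP/NEON hazard. Previously this was keyed on isLikeA9()
  // (Cortex-A9/A15/Krait); any CPU can now opt in via the feature.
  bool LoadStoreBlocksSkip = STI.hasMuxedUnits() && LastMI->mayLoadOrStore();
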
Index: lib/Target/ARM/ARMLoadStoreOptimizer.cpp
===================================================================
--- lib/Target/ARM/ARMLoadStoreOptimizer.cpp
+++ lib/Target/ARM/ARMLoadStoreOptimizer.cpp
@@ -982,7 +982,7 @@
   bool CanMergeToLSMulti = true;
-  // On swift vldm/vstm starting with an odd register number as that needs
-  // more uops than single vldrs.
-  if (STI->isSwift() && !isNotVFP && (PRegNum % 2) == 1)
+  // On targets with slow-odd-reg, a vldm/vstm starting with an odd register
+  // number needs more uops than single vldrs.
+  if (STI->hasSlowOddRegister() && !isNotVFP && (PRegNum % 2) == 1)
     CanMergeToLSMulti = false;
 
   // LDRD/STRD do not allow SP/PC. LDM/STM do not support it or have it
Index: lib/Target/ARM/ARMSubtarget.h
===================================================================
--- lib/Target/ARM/ARMSubtarget.h
+++ lib/Target/ARM/ARMSubtarget.h
@@ -249,6 +249,19 @@
   /// If true, ISHST barriers will be used for Release semantics.
   bool PreferISHST = false;
 
+  /// If true, a VLDM/VSTM starting with an odd register number is considered
+  /// to take more microops than single VLDRS/VSTRS.
+  bool SlowOddRegister = false;
+
+  /// If true, loading into a D subregister will be penalized.
+  bool SlowLoadDSubregister = false;
+
+  /// If true, prefer to allocate VFPs with stride 4.
+  bool UseStride4VFPs = false;
+
+  /// If true, the AGU and NEON/FPU units are multiplexed.
+  bool HasMuxedUnits = false;
+
   /// If true, VMOVRS, VMOVSR and VMOVS will be converted from VFP to NEON.
   bool UseNEONForFPMovs = false;
 
@@ -382,6 +395,9 @@
   bool hasV8MBaselineOps() const { return HasV8MBaselineOps; }
   bool hasV8MMainlineOps() const { return HasV8MMainlineOps; }
 
+  /// @{
+  /// These functions are obsolete; please consider adding subtarget features
+  /// or properties instead of calling them.
   bool isCortexA5() const { return ARMProcFamily == CortexA5; }
   bool isCortexA7() const { return ARMProcFamily == CortexA7; }
   bool isCortexA8() const { return ARMProcFamily == CortexA8; }
@@ -392,6 +408,7 @@
   bool isLikeA9() const { return isCortexA9() || isCortexA15() || isKrait(); }
   bool isCortexR5() const { return ARMProcFamily == CortexR5; }
   bool isKrait() const { return ARMProcFamily == Krait; }
+  /// @}
 
   bool hasARMOps() const { return !NoARM; }
 
@@ -431,6 +448,9 @@
   bool hasSlowVDUP32() const { return HasSlowVDUP32; }
   bool preferVMOVSR() const { return PreferVMOVSR; }
   bool preferISHSTBarriers() const { return PreferISHST; }
+  bool hasSlowOddRegister() const { return SlowOddRegister; }
+  bool hasSlowLoadDSubregister() const { return SlowLoadDSubregister; }
+  bool hasMuxedUnits() const { return HasMuxedUnits; }
   bool useNEONForFPMovs() const { return UseNEONForFPMovs; }
   bool checkVLDnAccessAlignment() const { return CheckVLDnAlign; }
   bool nonpipelinedVFP() const { return NonpipelinedVFP; }
Index: lib/Target/ARM/ARMSubtarget.cpp
===================================================================
--- lib/Target/ARM/ARMSubtarget.cpp
+++ lib/Target/ARM/ARMSubtarget.cpp
@@ -310,7 +310,8 @@
   // For general targets, the prologue can grow when VFPs are allocated with
   // stride 4 (more vpush instructions). But WatchOS uses a compact unwind
   // format which it's more important to get right.
-  return isTargetWatchABI() || (isSwift() && !MF.getFunction()->optForMinSize());
+  return isTargetWatchABI() ||
+         (UseStride4VFPs && !MF.getFunction()->optForMinSize());
 }
 
 bool ARMSubtarget::useMovt(const MachineFunction &MF) const {
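
Note (illustrative sketch, not part of the patch): the slow-odd-reg restriction in the ARMLoadStoreOptimizer hunk above only bites when a run of consecutive VFP loads or stores starts at a register with an odd encoding. PRegNum, isNotVFP and STI are the locals already used in that hunk; the later SP/PC checks that also affect the merge decision are left out here.

  // PRegNum is the encoding of the first register of the candidate run
  // (s0 -> 0, s1 -> 1, s2 -> 2, ...). With +slow-odd-reg, a run such as
  // {s1, s2, s3} is left as individual VLDRS, while {s2, s3, s4} may still
  // be merged into a single VLDM.
  bool OddStart = !isNotVFP && (PRegNum % 2) == 1;
  bool CanMergeToLSMulti = !(STI->hasSlowOddRegister() && OddStart);
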
Index: lib/Target/ARM/ARMTargetTransformInfo.cpp
===================================================================
--- lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -259,10 +259,8 @@
                                    unsigned Index) {
-  // Penalize inserting into an D-subregister. We end up with a three times
-  // lower estimated throughput on swift.
-  if (ST->isSwift() &&
-      Opcode == Instruction::InsertElement &&
-      ValTy->isVectorTy() &&
-      ValTy->getScalarSizeInBits() <= 32)
+  // Penalize inserting into a D-subregister. On targets with slow loads into
+  // D subregisters this has about a three times lower estimated throughput.
+  if (ST->hasSlowLoadDSubregister() && Opcode == Instruction::InsertElement &&
+      ValTy->isVectorTy() && ValTy->getScalarSizeInBits() <= 32)
     return 3;
 
   if ((Opcode == Instruction::InsertElement ||
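
Note (illustrative sketch, not part of the patch): with +slow-load-D-subreg, the cost-model hook above reports a cost of 3 for inserting a scalar of 32 bits or fewer into a vector. A caller would see something like the following, where TTI is a TargetTransformInfo for such a subtarget and VecTy is, for example, a <4 x float> vector type.

  // Inserting a single lane goes through a D subregister and creates a
  // renaming dependency on these targets, hence the fixed cost of 3.
  int Cost = TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
                                    /*Index=*/1);
  // Cost == 3 here instead of the generic estimate.
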