Index: lib/Target/AArch64/AArch64.td
===================================================================
--- lib/Target/AArch64/AArch64.td
+++ lib/Target/AArch64/AArch64.td
@@ -58,6 +58,50 @@
                                          "Reserve X18, making it unavailable "
                                          "as a GPR">;
 
+def FeatureMergeNarrowLd : SubtargetFeature<"merge-narrow-ld",
+                                            "MergeNarrowLoads", "true",
+                                            "Merge narrow load instructions">;
+
+def FeatureUseAA : SubtargetFeature<"use-aa", "UseAA", "true",
+                                    "Use alias analysis during codegen">;
+
+def FeatureBalanceFPOps : SubtargetFeature<"balance-fp-ops", "BalanceFPOps",
+    "true",
+    "balance mix of odd and even D-registers for fp multiply(-accumulate) ops">;
+
+def FeaturePredictableSelectIsExpensive : SubtargetFeature<
+    "predictable-select-expensive", "PredictableSelectIsExpensive", "true",
+    "Prefer likely predicted branches over selects">;
+
+def FeatureCustomCheapAsMoveHandling : SubtargetFeature<"custom-cheap-as-move",
+    "CustomAsCheapAsMove", "true",
+    "Use custom code for TargetInstrInfo::isAsCheapAsAMove()">;
+
+def FeaturePostRAScheduler : SubtargetFeature<"use-postra-scheduler",
+    "UsePostRAScheduler", "true", "Schedule again after register allocation">;
+
+def FeatureSlowMisaligned128Store : SubtargetFeature<"slow-misaligned-128store",
+    "Misaligned128StoreIsSlow", "true", "Misaligned 128 bit stores are slow">;
+
+def FeatureAvoidQuadLdStPairs : SubtargetFeature<"no-quad-ldst-pairs",
+    "AvoidQuadLdStPairs", "true",
+    "Do not form quad load/store pair operations">;
+
+def FeatureAlternateSExtLoadCVTF32Pattern : SubtargetFeature<
+    "alternate-sextload-cvt-f32-pattern", "UseAlternateSExtLoadCVTF32Pattern",
+    "true", "Use alternative pattern for sextload convert to f32">;
+
+def FeatureMacroOpFusion : SubtargetFeature<
+    "macroop-fusion", "HasMacroOpFusion", "true",
+    "CPU supports macro op fusion">;
+
+def FeatureDisableLatencySchedHeuristic : SubtargetFeature<
+    "disable-latency-sched-heuristic", "DisableLatencySchedHeuristic", "true",
+    "Disable latency scheduling heuristic">;
+
+def FeatureUseRSqrt : SubtargetFeature<
+    "use-reverse-square-root", "UseRSqrt", "true", "Use reverse square root">;
+
 //===----------------------------------------------------------------------===//
 // Architectures.
 //
@@ -94,57 +138,87 @@
 include "AArch64SchedKryo.td"
 
 def ProcA35 : SubtargetFeature<"a35", "ARMProcFamily", "CortexA35",
-                               "Cortex-A35 ARM processors",
-                               [FeatureFPARMv8,
-                               FeatureNEON,
-                               FeatureCrypto,
+                               "Cortex-A35 ARM processors", [
                                FeatureCRC,
-                               FeaturePerfMon]>;
+                               FeatureCrypto,
+                               FeatureFPARMv8,
+                               FeatureNEON,
+                               FeaturePerfMon
+                               ]>;
 
 def ProcA53 : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53",
-                               "Cortex-A53 ARM processors",
-                               [FeatureFPARMv8,
-                               FeatureNEON,
-                               FeatureCrypto,
+                               "Cortex-A53 ARM processors", [
+                               FeatureBalanceFPOps,
                                FeatureCRC,
-                               FeaturePerfMon]>;
+                               FeatureCrypto,
+                               FeatureCustomCheapAsMoveHandling,
+                               FeatureFPARMv8,
+                               FeatureNEON,
+                               FeaturePerfMon,
+                               FeaturePostRAScheduler,
+                               FeatureUseAA
+                               ]>;
 
 def ProcA57 : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57",
-                               "Cortex-A57 ARM processors",
-                               [FeatureFPARMv8,
-                               FeatureNEON,
-                               FeatureCrypto,
+                               "Cortex-A57 ARM processors", [
+                               FeatureBalanceFPOps,
                                FeatureCRC,
-                               FeaturePerfMon]>;
+                               FeatureCrypto,
+                               FeatureCustomCheapAsMoveHandling,
+                               FeatureFPARMv8,
+                               FeatureMergeNarrowLd,
+                               FeatureNEON,
+                               FeaturePerfMon,
+                               FeaturePostRAScheduler,
+                               FeaturePredictableSelectIsExpensive
+                               ]>;
 
 def ProcCyclone : SubtargetFeature<"cyclone", "ARMProcFamily", "Cyclone",
-                                   "Cyclone",
-                                   [FeatureFPARMv8,
-                                   FeatureNEON,
+                                   "Cyclone", [
+                                   FeatureAlternateSExtLoadCVTF32Pattern,
                                    FeatureCrypto,
+                                   FeatureDisableLatencySchedHeuristic,
+                                   FeatureFPARMv8,
+                                   FeatureMacroOpFusion,
+                                   FeatureNEON,
                                    FeaturePerfMon,
-                                   FeatureZCRegMove, FeatureZCZeroing]>;
+                                   FeatureSlowMisaligned128Store,
+                                   FeatureZCRegMove,
+                                   FeatureZCZeroing
+                                   ]>;
 
 def ProcExynosM1 : SubtargetFeature<"exynosm1", "ARMProcFamily", "ExynosM1",
-                                    "Samsung Exynos-M1 processors",
-                                    [FeatureFPARMv8,
-                                    FeatureNEON,
-                                    FeatureCrypto,
+                                    "Samsung Exynos-M1 processors", [
+                                    FeatureAvoidQuadLdStPairs,
                                     FeatureCRC,
-                                    FeaturePerfMon]>;
+                                    FeatureCrypto,
+                                    FeatureCustomCheapAsMoveHandling,
+                                    FeatureFPARMv8,
+                                    FeatureNEON,
+                                    FeaturePerfMon,
+                                    FeatureUseRSqrt
+                                    ]>;
 
 def ProcKryo : SubtargetFeature<"kryo", "ARMProcFamily", "Kryo",
-                                "Qualcomm Kryo processors",
-                                [FeatureFPARMv8,
-                                FeatureNEON,
-                                FeatureCrypto,
+                                "Qualcomm Kryo processors", [
                                 FeatureCRC,
-                                FeaturePerfMon]>;
-
-def : ProcessorModel<"generic", NoSchedModel, [FeatureFPARMv8,
-                                               FeatureNEON,
-                                               FeatureCRC,
-                                               FeaturePerfMon]>;
+                                FeatureCrypto,
+                                FeatureCustomCheapAsMoveHandling,
+                                FeatureFPARMv8,
+                                FeatureMergeNarrowLd,
+                                FeatureNEON,
+                                FeaturePerfMon,
+                                FeaturePostRAScheduler,
+                                FeaturePredictableSelectIsExpensive
+                                ]>;
+
+def : ProcessorModel<"generic", NoSchedModel, [
+                     FeatureCRC,
+                     FeatureFPARMv8,
+                     FeatureNEON,
+                     FeaturePerfMon,
+                     FeaturePostRAScheduler
+                     ]>;
 
 // FIXME: Cortex-A35 is currently modelled as a Cortex-A53
 def : ProcessorModel<"cortex-a35", CortexA53Model, [ProcA35]>;
Index: lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp
===================================================================
--- lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp
+++ lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp
@@ -314,9 +314,7 @@
   if (skipFunction(*F.getFunction()))
     return false;
 
-  // Don't do anything if this isn't an A53 or A57.
-  if (!(F.getSubtarget<AArch64Subtarget>().isCortexA53() ||
-        F.getSubtarget<AArch64Subtarget>().isCortexA57()))
+  if (!F.getSubtarget<AArch64Subtarget>().balanceFPOps())
     return false;
 
   bool Changed = false;
Index: lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- lib/Target/AArch64/AArch64ISelLowering.cpp
+++ lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -634,9 +634,7 @@
     }
   }
 
-  // Prefer likely predicted branches to selects on out-of-order cores.
-  if (Subtarget->isCortexA57() || Subtarget->isKryo())
-    PredictableSelectIsExpensive = true;
+  PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
 }
 
 void AArch64TargetLowering::addTypeForNEON(MVT VT, MVT PromotedBitwiseVT) {
@@ -814,12 +812,9 @@
   if (Subtarget->requiresStrictAlign())
     return false;
 
-  // FIXME: This is mostly true for Cyclone, but not necessarily others.
   if (Fast) {
-    // FIXME: Define an attribute for slow unaligned accesses instead of
-    // relying on the CPU type as a proxy.
-    // On Cyclone, unaligned 128-bit stores are slow.
-    *Fast = !Subtarget->isCyclone() || VT.getStoreSize() != 16 ||
+    // Some CPUs are fine with unaligned stores except for 128-bit ones.
+    *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 ||
             // See comments in performSTORECombine() for more details about
             // these conditions.
 
@@ -8812,9 +8807,7 @@
   // be included in TLI.allowsMisalignedMemoryAccesses(), and there should be
   // a call to that function here.
 
-  // Cyclone has bad performance on unaligned 16B stores when crossing line and
-  // page boundaries. We want to split such stores.
-  if (!Subtarget->isCyclone())
+  if (!Subtarget->isMisaligned128StoreSlow())
     return SDValue();
 
   // Don't split at -Oz.
Index: lib/Target/AArch64/AArch64InstrInfo.cpp
===================================================================
--- lib/Target/AArch64/AArch64InstrInfo.cpp
+++ lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -544,8 +544,7 @@
 // FIXME: this implementation should be micro-architecture dependent, so a
 // micro-architecture target hook should be introduced here in future.
 bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr *MI) const {
-  if (!Subtarget.isCortexA57() && !Subtarget.isCortexA53() &&
-      !Subtarget.isExynosM1() && !Subtarget.isKryo())
+  if (!Subtarget.hasCustomCheapAsMoveHandling())
     return MI->isAsCheapAsAMove();
 
   unsigned Imm;
@@ -559,7 +558,7 @@
   case AArch64::ADDXri:
   case AArch64::SUBWri:
   case AArch64::SUBXri:
-    return (Subtarget.isExynosM1() ||
+    return (Subtarget.getProcFamily() == AArch64Subtarget::ExynosM1 ||
             MI->getOperand(3).getImm() == 0);
 
   // add/sub on register with shift
@@ -568,7 +567,7 @@
   case AArch64::SUBWrs:
   case AArch64::SUBXrs:
     Imm = MI->getOperand(3).getImm();
-    return (Subtarget.isExynosM1() &&
+    return (Subtarget.getProcFamily() == AArch64Subtarget::ExynosM1 &&
             AArch64_AM::getArithShiftValue(Imm) < 4);
 
   // logical ops on immediate
@@ -609,7 +608,7 @@
   case AArch64::ORRWrs:
   case AArch64::ORRXrs:
     Imm = MI->getOperand(3).getImm();
-    return (Subtarget.isExynosM1() &&
+    return (Subtarget.getProcFamily() == AArch64Subtarget::ExynosM1 &&
             AArch64_AM::getShiftValue(Imm) < 4 &&
             AArch64_AM::getShiftType(Imm) == AArch64_AM::LSL);
 
@@ -1522,8 +1521,8 @@
   if (isLdStPairSuppressed(MI))
     return false;
 
-  // Do not pair quad ld/st for Exynos.
-  if (Subtarget.isExynosM1()) {
+  // On some CPUs quad load/store pairs are slower than two single load/stores.
+  if (Subtarget.avoidQuadLdStPairs()) {
     switch (MI->getOpcode()) {
     default:
       break;
@@ -1801,8 +1800,8 @@
 
 bool AArch64InstrInfo::shouldScheduleAdjacent(MachineInstr *First,
                                               MachineInstr *Second) const {
-  if (Subtarget.isCyclone()) {
-    // Cyclone can fuse CMN, CMP, TST followed by Bcc.
+  if (Subtarget.hasMacroOpFusion()) {
+    // Fuse CMN, CMP, TST followed by Bcc.
     unsigned SecondOpcode = Second->getOpcode();
     if (SecondOpcode == AArch64::Bcc) {
       switch (First->getOpcode()) {
@@ -1817,7 +1816,7 @@
         return true;
       }
     }
-    // Cyclone B0 also supports ALU operations followed by CBZ/CBNZ.
+    // Fuse ALU operations followed by CBZ/CBNZ.
     if (SecondOpcode == AArch64::CBNZW || SecondOpcode == AArch64::CBNZX ||
         SecondOpcode == AArch64::CBZW || SecondOpcode == AArch64::CBZX) {
       switch (First->getOpcode()) {
Index: lib/Target/AArch64/AArch64InstrInfo.td
===================================================================
--- lib/Target/AArch64/AArch64InstrInfo.td
+++ lib/Target/AArch64/AArch64InstrInfo.td
@@ -34,7 +34,8 @@
 
 def IsLE : Predicate<"Subtarget->isLittleEndian()">;
 def IsBE : Predicate<"!Subtarget->isLittleEndian()">;
-def IsCyclone : Predicate<"Subtarget->isCyclone()">;
+def UseAlternateSExtLoadCVTF32
+    : Predicate<"Subtarget->useAlternateSExtLoadCVTF32Pattern()">;
 
 //===----------------------------------------------------------------------===//
 // AArch64-specific DAG Nodes.
@@ -4957,7 +4958,8 @@
                                0),
                            dsub)),
                          0),
-                       ssub)))>, Requires<[NotForCodeSize, IsCyclone]>;
+                       ssub)))>,
+          Requires<[NotForCodeSize, UseAlternateSExtLoadCVTF32]>;
 
 def : SExtLoadi8CVTf32Pat<(ro8.Wpat GPR64sp:$Rn, GPR32:$Rm, ro8.Wext:$ext),
                           (LDRBroW GPR64sp:$Rn, GPR32:$Rm, ro8.Wext:$ext)>;
@@ -5010,7 +5012,8 @@
                                0),
                            dsub)),
                          0),
-                       dsub)))>, Requires<[NotForCodeSize, IsCyclone]>;
+                       dsub)))>,
+          Requires<[NotForCodeSize, UseAlternateSExtLoadCVTF32]>;
 
 def : SExtLoadi16CVTf64Pat<(ro16.Wpat GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext),
                            (LDRHroW GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext)>;
Index: lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
===================================================================
--- lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -160,10 +160,6 @@
   // Find and promote load instructions which read directly from store.
   bool tryToPromoteLoadFromStore(MachineBasicBlock::iterator &MBBI);
 
-  // Check if converting two narrow loads into a single wider load with
-  // bitfield extracts could be enabled.
-  bool enableNarrowLdMerge(MachineFunction &Fn);
-
   bool optimizeBlock(MachineBasicBlock &MBB, bool enableNarrowLdOpt);
   bool runOnMachineFunction(MachineFunction &Fn) override;
 
@@ -1912,15 +1908,6 @@
   return Modified;
 }
 
-bool AArch64LoadStoreOpt::enableNarrowLdMerge(MachineFunction &Fn) {
-  bool ProfitableArch = Subtarget->isCortexA57() || Subtarget->isKryo();
-  // FIXME: The benefit from converting narrow loads into a wider load could be
-  // microarchitectural as it assumes that a single load with two bitfield
-  // extracts is cheaper than two narrow loads. Currently, this conversion is
-  // enabled only in cortex-a57 on which performance benefits were verified.
-  return ProfitableArch && !Subtarget->requiresStrictAlign();
-}
-
 bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
   if (skipFunction(*Fn.getFunction()))
     return false;
@@ -1936,7 +1923,8 @@
   UsedRegs.resize(TRI->getNumRegs());
 
   bool Modified = false;
-  bool enableNarrowLdOpt = enableNarrowLdMerge(Fn);
+  bool enableNarrowLdOpt =
+      Subtarget->mergeNarrowLoads() && !Subtarget->requiresStrictAlign();
   for (auto &MBB : Fn)
     Modified |= optimizeBlock(MBB, enableNarrowLdOpt);
 
Index: lib/Target/AArch64/AArch64Subtarget.h
===================================================================
--- lib/Target/AArch64/AArch64Subtarget.h
+++ lib/Target/AArch64/AArch64Subtarget.h
@@ -33,8 +33,8 @@
 class Triple;
 
 class AArch64Subtarget : public AArch64GenSubtargetInfo {
-protected:
-  enum ARMProcFamilyEnum {
+public:
+  enum ARMProcFamilyEnum : uint8_t {
     Others,
     CortexA35,
     CortexA53,
@@ -44,6 +44,7 @@
     Kryo
   };
 
+protected:
   /// ARMProcFamily - ARM processor family: Cortex-A53, Cortex-A57, and others.
   ARMProcFamilyEnum ARMProcFamily = Others;
 
@@ -66,6 +67,24 @@
   // StrictAlign - Disallow unaligned memory accesses.
   bool StrictAlign = false;
 
+  bool MergeNarrowLoads = false;
+  bool UseAA = false;
+  bool PredictableSelectIsExpensive = false;
+  bool BalanceFPOps = false;
+  bool CustomAsCheapAsMove = false;
+  bool UsePostRAScheduler = false;
+  bool Misaligned128StoreIsSlow = false;
+  bool AvoidQuadLdStPairs = false;
+  bool UseAlternateSExtLoadCVTF32Pattern = false;
+  bool HasMacroOpFusion = false;
+  bool DisableLatencySchedHeuristic = false;
+  bool UseRSqrt = false;
+  uint8_t MaxInterleaveFactor = 2;
+  uint8_t VectorInsertExtractBaseCost = 3;
+  uint16_t CacheLineSize = 0;
+  uint16_t PrefetchDistance = 0;
+  uint16_t MinPrefetchStride = 1;
+  unsigned MaxPrefetchIterationsAhead = UINT_MAX;
   // ReserveX18 - X18 is not available as a general purpose register.
   bool ReserveX18;
 
@@ -93,6 +112,9 @@
   /// subtarget initialization.
   AArch64Subtarget &initializeSubtargetDependencies(StringRef FS);
 
+  /// Initialize properties based on the selected processor family.
+  void initializeProperties();
+
 public:
   /// This constructor initializes the data members to match that
   /// of the specified triple.
@@ -123,7 +145,15 @@
   const Triple &getTargetTriple() const { return TargetTriple; }
   bool enableMachineScheduler() const override { return true; }
   bool enablePostRAScheduler() const override {
-    return isGeneric() || isCortexA53() || isCortexA57() || isKryo();
+    return UsePostRAScheduler;
+  }
+
+  /// Returns ARM processor family.
+  /// Avoid this function! CPU specifics should be kept local to this class
+  /// and preferably modeled with SubtargetFeatures or properties in
+  /// initializeProperties().
+  ARMProcFamilyEnum getProcFamily() const {
+    return ARMProcFamily;
   }
 
   bool hasV8_1aOps() const { return HasV8_1aOps; }
@@ -140,6 +170,30 @@
   bool hasNEON() const { return HasNEON; }
   bool hasCrypto() const { return HasCrypto; }
   bool hasCRC() const { return HasCRC; }
+  bool mergeNarrowLoads() const { return MergeNarrowLoads; }
+  bool balanceFPOps() const { return BalanceFPOps; }
+  bool predictableSelectIsExpensive() const {
+    return PredictableSelectIsExpensive;
+  }
+  bool hasCustomCheapAsMoveHandling() const { return CustomAsCheapAsMove; }
+  bool isMisaligned128StoreSlow() const { return Misaligned128StoreIsSlow; }
+  bool avoidQuadLdStPairs() const { return AvoidQuadLdStPairs; }
+  bool useAlternateSExtLoadCVTF32Pattern() const {
+    return UseAlternateSExtLoadCVTF32Pattern;
+  }
+  bool hasMacroOpFusion() const { return HasMacroOpFusion; }
+  bool useRSqrt() const { return UseRSqrt; }
+  unsigned getMaxInterleaveFactor() const { return MaxInterleaveFactor; }
+  unsigned getVectorInsertExtractBaseCost() const {
+    return VectorInsertExtractBaseCost;
+  }
+  unsigned getCacheLineSize() const { return CacheLineSize; }
+  unsigned getPrefetchDistance() const { return PrefetchDistance; }
+  unsigned getMinPrefetchStride() const { return MinPrefetchStride; }
+  unsigned getMaxPrefetchIterationsAhead() const {
+    return MaxPrefetchIterationsAhead;
+  }
+
   /// CPU has TBI (top byte of addresses is ignored during HW address
   /// translation) and OS enables it.
   bool supportsAddressTopByteIgnored() const;
@@ -160,14 +214,7 @@
   bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); }
   bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); }
 
-  bool isGeneric() const { return CPUString == "generic"; }
-  bool isCyclone() const { return CPUString == "cyclone"; }
-  bool isCortexA57() const { return CPUString == "cortex-a57"; }
-  bool isCortexA53() const { return CPUString == "cortex-a53"; }
-  bool isExynosM1() const { return CPUString == "exynos-m1"; }
-  bool isKryo() const { return CPUString == "kryo"; }
-
-  bool useAA() const override { return isCortexA53(); }
+  bool useAA() const override { return UseAA; }
 
   /// getMaxInlineSizeThreshold - Returns the maximum memset / memcpy size
   /// that still makes it profitable to inline the call.
Index: lib/Target/AArch64/AArch64Subtarget.cpp
===================================================================
--- lib/Target/AArch64/AArch64Subtarget.cpp
+++ lib/Target/AArch64/AArch64Subtarget.cpp
@@ -44,9 +44,36 @@
     CPUString = "generic";
 
   ParseSubtargetFeatures(CPUString, FS);
+  initializeProperties();
+
   return *this;
 }
 
+void AArch64Subtarget::initializeProperties() {
+  // Initialize CPU specific properties. We should add a tablegen feature for
+  // this in the future so we can specify it together with the subtarget
+  // features.
+  switch (ARMProcFamily) {
+  case Cyclone:
+    CacheLineSize = 64;
+    PrefetchDistance = 280;
+    MinPrefetchStride = 2048;
+    MaxPrefetchIterationsAhead = 3;
+    break;
+  case Others:
+  case CortexA35:
+  case CortexA53:
+  case CortexA57:
+    MaxInterleaveFactor = 4;
+    break;
+  case ExynosM1:
+  case Kryo:
+    MaxInterleaveFactor = 4;
+    VectorInsertExtractBaseCost = 2;
+    break;
+  }
+}
+
 AArch64Subtarget::AArch64Subtarget(const Triple &TT, const std::string &CPU,
                                    const std::string &FS,
                                    const TargetMachine &TM, bool LittleEndian)
@@ -115,8 +142,7 @@
   // Enabling or Disabling the latency heuristic is a close call: It seems to
   // help nearly no benchmark on out-of-order architectures, on the other hand
   // it regresses register pressure on a few benchmarking.
-  if (isCyclone())
-    Policy.DisableLatencyHeuristic = true;
+  Policy.DisableLatencyHeuristic = DisableLatencySchedHeuristic;
 }
 
 bool AArch64Subtarget::enableEarlyIfConversion() const {
@@ -138,8 +164,5 @@
 
 std::unique_ptr<PBQPRAConstraint>
 AArch64Subtarget::getCustomPBQPConstraints() const {
-  if (!isCortexA57())
-    return nullptr;
-
-  return llvm::make_unique<A57ChainingConstraint>();
+  return balanceFPOps() ? llvm::make_unique<A57ChainingConstraint>() : nullptr;
 }
Index: lib/Target/AArch64/AArch64TargetMachine.cpp
===================================================================
--- lib/Target/AArch64/AArch64TargetMachine.cpp
+++ lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -147,8 +147,7 @@
   // (52 mantissa bits) are 2 and 3, respectively.
   unsigned ExtraStepsF = 2,
            ExtraStepsD = ExtraStepsF + 1;
-  // FIXME: Enable x^-1/2 only for Exynos M1 at the moment.
-  bool UseRsqrt = ST.isExynosM1();
+  bool UseRsqrt = ST.useRSqrt();
 
   TM.Options.Reciprocals.setDefaults("sqrtf", UseRsqrt, ExtraStepsF);
   TM.Options.Reciprocals.setDefaults("sqrtd", UseRsqrt, ExtraStepsD);
Index: lib/Target/AArch64/AArch64TargetTransformInfo.cpp
===================================================================
--- lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -368,9 +368,7 @@
   }
 
   // All other insert/extracts cost this much.
-  if (ST->isKryo())
-    return 2;
-  return 3;
+  return ST->getVectorInsertExtractBaseCost();
 }
 
 int AArch64TTIImpl::getArithmeticInstrCost(
@@ -529,9 +527,7 @@
 }
 
 unsigned AArch64TTIImpl::getMaxInterleaveFactor(unsigned VF) {
-  if (ST->isCortexA57() || ST->isKryo())
-    return 4;
-  return 2;
+  return ST->getMaxInterleaveFactor();
 }
 
 void AArch64TTIImpl::getUnrollingPreferences(Loop *L,
@@ -630,28 +626,17 @@
 }
 
 unsigned AArch64TTIImpl::getCacheLineSize() {
-  if (ST->isCyclone())
-    return 64;
-  return BaseT::getCacheLineSize();
+  return ST->getCacheLineSize();
 }
 
 unsigned AArch64TTIImpl::getPrefetchDistance() {
-  if (ST->isCyclone())
-    return 280;
-  return BaseT::getPrefetchDistance();
+  return ST->getPrefetchDistance();
 }
 
 unsigned AArch64TTIImpl::getMinPrefetchStride() {
-  if (ST->isCyclone())
-    // The HW prefetcher handles accesses with strides up to 2KB.
-    return 2048;
-  return BaseT::getMinPrefetchStride();
+  return ST->getMinPrefetchStride();
 }
 
 unsigned AArch64TTIImpl::getMaxPrefetchIterationsAhead() {
-  if (ST->isCyclone())
-    // Be conservative for now and don't prefetch ahead too much since the loop
-    // may terminate early.
-    return 3;
-  return BaseT::getMaxPrefetchIterationsAhead();
+  return ST->getMaxPrefetchIterationsAhead();
 }
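
For context, a minimal sketch (not part of the patch) of how the new knobs are meant to be consumed: once a tuning is a SubtargetFeature, a core opts into it purely in AArch64.td, and the backend picks it up through the generated feature bits and the AArch64Subtarget accessors (e.g. predictableSelectIsExpensive()), with no new isFooCPU() predicate in C++. The processor name "hypothetical-cpu" and the reuse of CortexA57Model below are placeholders for illustration only.

// Hypothetical processor definition reusing the features added above.
def : ProcessorModel<"hypothetical-cpu", CortexA57Model, [
                     FeatureCRC,
                     FeatureCrypto,
                     FeatureFPARMv8,
                     FeatureNEON,
                     FeaturePerfMon,
                     FeaturePostRAScheduler,
                     FeaturePredictableSelectIsExpensive
                     ]>;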