Commit 651cff4

Committed on Jun 2, 2016
AArch64: Do not test for CPUs, use SubtargetFeatures
Testing for specific CPUs has a number of problems; it is better to use subtarget features:

- When a tweak is added for a specific CPU it is often desirable for the next version of that CPU as well, yet we often forget to add it.
- It is hard to keep track of checks scattered around the target code; declaring all target specifics together with the CPU in the tablegen file is a clearer representation.
- Subtarget features can be tweaked from the command line.

To discourage people from using CPU checks in the future I removed the isCortexXX(), isCyclone(), ... functions. I added a getProcFamily() function for exceptional circumstances, but made it clear in the comment that its use is discouraged.

Reformat the feature lists in AArch64.td to one feature per line, in alphabetical order, to simplify merging and sorting for out-of-tree tweaks.

No functional change intended.

Differential Revision: http://reviews.llvm.org/D20762

llvm-svn: 271555
1 parent: 5c0bc02, commit: 651cff4

10 files changed: +224 −115 lines
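The pattern is uniform across the patch: each micro-architectural tweak becomes a SubtargetFeature in AArch64.td, the processor definitions opt into the features they want, and the C++ code queries a subtarget accessor such as Subtarget->mergeNarrowLoads() instead of comparing CPU names. As a minimal sketch of what an out-of-tree tweak could look like after this change (the feature and processor names below are hypothetical, not part of the patch):

// Hypothetical out-of-tree tuning flag and CPU definition, shown only to
// illustrate the shape of the new one-feature-per-line style.
def FeatureFooPrefetchTuning : SubtargetFeature<"foo-prefetch-tuning",
    "HasFooPrefetchTuning", "true",
    "Tune software prefetching for the Foo core">;

def ProcFoo : SubtargetFeature<"foo", "ARMProcFamily", "Others",
                               "Vendor Foo processors", [
                               FeatureCRC,
                               FeatureCrypto,
                               FeatureFPARMv8,
                               FeatureFooPrefetchTuning,
                               FeatureNEON,
                               FeaturePerfMon,
                               FeaturePostRAScheduler
                               ]>;

Because these are ordinary subtarget features, they can also be toggled per invocation, for example llc -mcpu=cortex-a57 -mattr=-merge-narrow-ld or llc -mcpu=generic -mattr=+use-postra-scheduler.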
 

‎llvm/lib/Target/AArch64/AArch64.td

(+108 −34)

@@ -58,6 +58,50 @@ def FeatureReserveX18 : SubtargetFeature<"reserve-x18", "ReserveX18", "true",
                                          "Reserve X18, making it unavailable "
                                          "as a GPR">;
 
+def FeatureMergeNarrowLd : SubtargetFeature<"merge-narrow-ld",
+                                            "MergeNarrowLoads", "true",
+                                            "Merge narrow load instructions">;
+
+def FeatureUseAA : SubtargetFeature<"use-aa", "UseAA", "true",
+                                    "Use alias analysis during codegen">;
+
+def FeatureBalanceFPOps : SubtargetFeature<"balance-fp-ops", "BalanceFPOps",
+    "true",
+    "balance mix of odd and even D-registers for fp multiply(-accumulate) ops">;
+
+def FeaturePredictableSelectIsExpensive : SubtargetFeature<
+    "predictable-select-expensive", "PredictableSelectIsExpensive", "true",
+    "Prefer likely predicted branches over selects">;
+
+def FeatureCustomCheapAsMoveHandling : SubtargetFeature<"custom-cheap-as-move",
+    "CustomAsCheapAsMove", "true",
+    "Use custom code for TargetInstrInfo::isAsCheapAsAMove()">;
+
+def FeaturePostRAScheduler : SubtargetFeature<"use-postra-scheduler",
+    "UsePostRAScheduler", "true", "Schedule again after register allocation">;
+
+def FeatureSlowMisaligned128Store : SubtargetFeature<"slow-misaligned-128store",
+    "Misaligned128StoreIsSlow", "true", "Misaligned 128 bit stores are slow">;
+
+def FeatureAvoidQuadLdStPairs : SubtargetFeature<"no-quad-ldst-pairs",
+    "AvoidQuadLdStPairs", "true",
+    "Do not form quad load/store pair operations">;
+
+def FeatureAlternateSExtLoadCVTF32Pattern : SubtargetFeature<
+    "alternate-sextload-cvt-f32-pattern", "UseAlternateSExtLoadCVTF32Pattern",
+    "true", "Use alternative pattern for sextload convert to f32">;
+
+def FeatureMacroOpFusion : SubtargetFeature<
+    "macroop-fusion", "HasMacroOpFusion", "true",
+    "CPU supports macro op fusion">;
+
+def FeatureDisableLatencySchedHeuristic : SubtargetFeature<
+    "disable-latency-sched-heuristic", "DisableLatencySchedHeuristic", "true",
+    "Disable latency scheduling heuristic">;
+
+def FeatureUseRSqrt : SubtargetFeature<
+    "use-reverse-square-root", "UseRSqrt", "true", "Use reverse square root">;
+
 //===----------------------------------------------------------------------===//
 // Architectures.
 //
@@ -94,57 +138,87 @@ include "AArch64SchedM1.td"
 include "AArch64SchedKryo.td"
 
 def ProcA35 : SubtargetFeature<"a35", "ARMProcFamily", "CortexA35",
-                               "Cortex-A35 ARM processors",
-                               [FeatureFPARMv8,
-                               FeatureNEON,
-                               FeatureCrypto,
+                               "Cortex-A35 ARM processors", [
                                FeatureCRC,
-                               FeaturePerfMon]>;
+                               FeatureCrypto,
+                               FeatureFPARMv8,
+                               FeatureNEON,
+                               FeaturePerfMon
+                               ]>;
 
 def ProcA53 : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53",
-                               "Cortex-A53 ARM processors",
-                               [FeatureFPARMv8,
-                               FeatureNEON,
-                               FeatureCrypto,
+                               "Cortex-A53 ARM processors", [
+                               FeatureBalanceFPOps,
                                FeatureCRC,
-                               FeaturePerfMon]>;
+                               FeatureCrypto,
+                               FeatureCustomCheapAsMoveHandling,
+                               FeatureFPARMv8,
+                               FeatureNEON,
+                               FeaturePerfMon,
+                               FeaturePostRAScheduler,
+                               FeatureUseAA
+                               ]>;
 
 def ProcA57 : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57",
-                               "Cortex-A57 ARM processors",
-                               [FeatureFPARMv8,
-                               FeatureNEON,
-                               FeatureCrypto,
+                               "Cortex-A57 ARM processors", [
+                               FeatureBalanceFPOps,
                                FeatureCRC,
-                               FeaturePerfMon]>;
+                               FeatureCrypto,
+                               FeatureCustomCheapAsMoveHandling,
+                               FeatureFPARMv8,
+                               FeatureMergeNarrowLd,
+                               FeatureNEON,
+                               FeaturePerfMon,
+                               FeaturePostRAScheduler,
+                               FeaturePredictableSelectIsExpensive
+                               ]>;
 
 def ProcCyclone : SubtargetFeature<"cyclone", "ARMProcFamily", "Cyclone",
-                                   "Cyclone",
-                                   [FeatureFPARMv8,
-                                   FeatureNEON,
+                                   "Cyclone", [
+                                   FeatureAlternateSExtLoadCVTF32Pattern,
                                    FeatureCrypto,
+                                   FeatureDisableLatencySchedHeuristic,
+                                   FeatureFPARMv8,
+                                   FeatureMacroOpFusion,
+                                   FeatureNEON,
                                    FeaturePerfMon,
-                                   FeatureZCRegMove, FeatureZCZeroing]>;
+                                   FeatureSlowMisaligned128Store,
+                                   FeatureZCRegMove,
+                                   FeatureZCZeroing
+                                   ]>;
 
 def ProcExynosM1 : SubtargetFeature<"exynosm1", "ARMProcFamily", "ExynosM1",
-                                    "Samsung Exynos-M1 processors",
-                                    [FeatureFPARMv8,
-                                    FeatureNEON,
-                                    FeatureCrypto,
+                                    "Samsung Exynos-M1 processors", [
+                                    FeatureAvoidQuadLdStPairs,
                                     FeatureCRC,
-                                    FeaturePerfMon]>;
+                                    FeatureCrypto,
+                                    FeatureCustomCheapAsMoveHandling,
+                                    FeatureFPARMv8,
+                                    FeatureNEON,
+                                    FeaturePerfMon,
+                                    FeatureUseRSqrt
+                                    ]>;
 
 def ProcKryo : SubtargetFeature<"kryo", "ARMProcFamily", "Kryo",
-                                "Qualcomm Kryo processors",
-                                [FeatureFPARMv8,
-                                FeatureNEON,
-                                FeatureCrypto,
+                                "Qualcomm Kryo processors", [
                                 FeatureCRC,
-                                FeaturePerfMon]>;
-
-def : ProcessorModel<"generic", NoSchedModel, [FeatureFPARMv8,
-                                               FeatureNEON,
-                                               FeatureCRC,
-                                               FeaturePerfMon]>;
+                                FeatureCrypto,
+                                FeatureCustomCheapAsMoveHandling,
+                                FeatureFPARMv8,
+                                FeatureMergeNarrowLd,
+                                FeatureNEON,
+                                FeaturePerfMon,
+                                FeaturePostRAScheduler,
+                                FeaturePredictableSelectIsExpensive
+                                ]>;
+
+def : ProcessorModel<"generic", NoSchedModel, [
+                     FeatureCRC,
+                     FeatureFPARMv8,
+                     FeatureNEON,
+                     FeaturePerfMon,
+                     FeaturePostRAScheduler
+                     ]>;
 
 // FIXME: Cortex-A35 is currently modelled as a Cortex-A53
 def : ProcessorModel<"cortex-a35", CortexA53Model, [ProcA35]>;

‎llvm/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp

(+1 −3)

@@ -314,9 +314,7 @@ bool AArch64A57FPLoadBalancing::runOnMachineFunction(MachineFunction &F) {
   if (skipFunction(*F.getFunction()))
     return false;
 
-  // Don't do anything if this isn't an A53 or A57.
-  if (!(F.getSubtarget<AArch64Subtarget>().isCortexA53() ||
-        F.getSubtarget<AArch64Subtarget>().isCortexA57()))
+  if (!F.getSubtarget<AArch64Subtarget>().balanceFPOps())
     return false;
 
   bool Changed = false;

‎llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

(+4 −11)

@@ -634,9 +634,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
     }
   }
 
-  // Prefer likely predicted branches to selects on out-of-order cores.
-  if (Subtarget->isCortexA57() || Subtarget->isKryo())
-    PredictableSelectIsExpensive = true;
+  PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
 }
 
 void AArch64TargetLowering::addTypeForNEON(MVT VT, MVT PromotedBitwiseVT) {
@@ -814,12 +812,9 @@ bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
   if (Subtarget->requiresStrictAlign())
     return false;
 
-  // FIXME: This is mostly true for Cyclone, but not necessarily others.
   if (Fast) {
-    // FIXME: Define an attribute for slow unaligned accesses instead of
-    // relying on the CPU type as a proxy.
-    // On Cyclone, unaligned 128-bit stores are slow.
-    *Fast = !Subtarget->isCyclone() || VT.getStoreSize() != 16 ||
+    // Some CPUs are fine with unaligned stores except for 128-bit ones.
+    *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 ||
             // See comments in performSTORECombine() for more details about
             // these conditions.
 
@@ -8792,9 +8787,7 @@ static SDValue split16BStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
   // be included in TLI.allowsMisalignedMemoryAccesses(), and there should be
   // a call to that function here.
 
-  // Cyclone has bad performance on unaligned 16B stores when crossing line and
-  // page boundaries. We want to split such stores.
-  if (!Subtarget->isCyclone())
+  if (!Subtarget->isMisaligned128StoreSlow())
     return SDValue();
 
   // Don't split at -Oz.

‎llvm/lib/Target/AArch64/AArch64InstrInfo.cpp

(+9 −10)

@@ -544,8 +544,7 @@ static bool canBeExpandedToORR(const MachineInstr *MI, unsigned BitSize) {
 // FIXME: this implementation should be micro-architecture dependent, so a
 // micro-architecture target hook should be introduced here in future.
 bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr *MI) const {
-  if (!Subtarget.isCortexA57() && !Subtarget.isCortexA53() &&
-      !Subtarget.isExynosM1() && !Subtarget.isKryo())
+  if (!Subtarget.hasCustomCheapAsMoveHandling())
     return MI->isAsCheapAsAMove();
 
   unsigned Imm;
@@ -559,7 +558,7 @@ bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr *MI) const {
   case AArch64::ADDXri:
   case AArch64::SUBWri:
   case AArch64::SUBXri:
-    return (Subtarget.isExynosM1() ||
+    return (Subtarget.getProcFamily() == AArch64Subtarget::ExynosM1 ||
             MI->getOperand(3).getImm() == 0);
 
   // add/sub on register with shift
@@ -568,7 +567,7 @@ bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr *MI) const {
   case AArch64::SUBWrs:
   case AArch64::SUBXrs:
     Imm = MI->getOperand(3).getImm();
-    return (Subtarget.isExynosM1() &&
+    return (Subtarget.getProcFamily() == AArch64Subtarget::ExynosM1 &&
             AArch64_AM::getArithShiftValue(Imm) < 4);
 
   // logical ops on immediate
@@ -609,7 +608,7 @@ bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr *MI) const {
   case AArch64::ORRWrs:
   case AArch64::ORRXrs:
     Imm = MI->getOperand(3).getImm();
-    return (Subtarget.isExynosM1() &&
+    return (Subtarget.getProcFamily() == AArch64Subtarget::ExynosM1 &&
             AArch64_AM::getShiftValue(Imm) < 4 &&
             AArch64_AM::getShiftType(Imm) == AArch64_AM::LSL);
 
@@ -1522,8 +1521,8 @@ bool AArch64InstrInfo::isCandidateToMergeOrPair(MachineInstr *MI) const {
   if (isLdStPairSuppressed(MI))
     return false;
 
-  // Do not pair quad ld/st for Exynos.
-  if (Subtarget.isExynosM1()) {
+  // On some CPUs quad load/store pairs are slower than two single load/stores.
+  if (Subtarget.avoidQuadLdStPairs()) {
     switch (MI->getOpcode()) {
     default:
       break;
@@ -1801,8 +1800,8 @@ bool AArch64InstrInfo::shouldClusterMemOps(MachineInstr *FirstLdSt,
 
 bool AArch64InstrInfo::shouldScheduleAdjacent(MachineInstr *First,
                                               MachineInstr *Second) const {
-  if (Subtarget.isCyclone()) {
-    // Cyclone can fuse CMN, CMP, TST followed by Bcc.
+  if (Subtarget.hasMacroOpFusion()) {
+    // Fuse CMN, CMP, TST followed by Bcc.
     unsigned SecondOpcode = Second->getOpcode();
     if (SecondOpcode == AArch64::Bcc) {
       switch (First->getOpcode()) {
@@ -1817,7 +1816,7 @@ bool AArch64InstrInfo::shouldScheduleAdjacent(MachineInstr *First,
         return true;
       }
     }
-    // Cyclone B0 also supports ALU operations followed by CBZ/CBNZ.
+    // Fuse ALU operations followed by CBZ/CBNZ.
     if (SecondOpcode == AArch64::CBNZW || SecondOpcode == AArch64::CBNZX ||
         SecondOpcode == AArch64::CBZW || SecondOpcode == AArch64::CBZX) {
       switch (First->getOpcode()) {

‎llvm/lib/Target/AArch64/AArch64InstrInfo.td

(+6 −3)

@@ -34,7 +34,8 @@ def HasSPE : Predicate<"Subtarget->hasSPE()">,
 
 def IsLE : Predicate<"Subtarget->isLittleEndian()">;
 def IsBE : Predicate<"!Subtarget->isLittleEndian()">;
-def IsCyclone : Predicate<"Subtarget->isCyclone()">;
+def UseAlternateSExtLoadCVTF32
+    : Predicate<"Subtarget->useAlternateSExtLoadCVTF32Pattern()">;
 
 //===----------------------------------------------------------------------===//
 // AArch64-specific DAG Nodes.
@@ -4957,7 +4958,8 @@ class SExtLoadi8CVTf32Pat<dag addrmode, dag INST>
                             0),
                         dsub)),
                     0),
-                 ssub)))>, Requires<[NotForCodeSize, IsCyclone]>;
+                 ssub)))>,
+          Requires<[NotForCodeSize, UseAlternateSExtLoadCVTF32]>;
 
 def : SExtLoadi8CVTf32Pat<(ro8.Wpat GPR64sp:$Rn, GPR32:$Rm, ro8.Wext:$ext),
                           (LDRBroW GPR64sp:$Rn, GPR32:$Rm, ro8.Wext:$ext)>;
@@ -5010,7 +5012,8 @@ class SExtLoadi16CVTf64Pat<dag addrmode, dag INST>
                             0),
                         dsub)),
                     0),
-                 dsub)))>, Requires<[NotForCodeSize, IsCyclone]>;
+                 dsub)))>,
+          Requires<[NotForCodeSize, UseAlternateSExtLoadCVTF32]>;
 
 def : SExtLoadi16CVTf64Pat<(ro16.Wpat GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext),
                            (LDRHroW GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext)>;

‎llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp

(+2 −14)

@@ -160,10 +160,6 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
   // Find and promote load instructions which read directly from store.
   bool tryToPromoteLoadFromStore(MachineBasicBlock::iterator &MBBI);
 
-  // Check if converting two narrow loads into a single wider load with
-  // bitfield extracts could be enabled.
-  bool enableNarrowLdMerge(MachineFunction &Fn);
-
   bool optimizeBlock(MachineBasicBlock &MBB, bool enableNarrowLdOpt);
 
   bool runOnMachineFunction(MachineFunction &Fn) override;
@@ -1912,15 +1908,6 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
   return Modified;
 }
 
-bool AArch64LoadStoreOpt::enableNarrowLdMerge(MachineFunction &Fn) {
-  bool ProfitableArch = Subtarget->isCortexA57() || Subtarget->isKryo();
-  // FIXME: The benefit from converting narrow loads into a wider load could be
-  // microarchitectural as it assumes that a single load with two bitfield
-  // extracts is cheaper than two narrow loads. Currently, this conversion is
-  // enabled only in cortex-a57 on which performance benefits were verified.
-  return ProfitableArch && !Subtarget->requiresStrictAlign();
-}
-
 bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
   if (skipFunction(*Fn.getFunction()))
     return false;
@@ -1936,7 +1923,8 @@ bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
   UsedRegs.resize(TRI->getNumRegs());
 
   bool Modified = false;
-  bool enableNarrowLdOpt = enableNarrowLdMerge(Fn);
+  bool enableNarrowLdOpt =
+      Subtarget->mergeNarrowLoads() && !Subtarget->requiresStrictAlign();
   for (auto &MBB : Fn)
     Modified |= optimizeBlock(MBB, enableNarrowLdOpt);

‎llvm/lib/Target/AArch64/AArch64Subtarget.cpp

(+29 −6)

@@ -44,9 +44,36 @@ AArch64Subtarget::initializeSubtargetDependencies(StringRef FS) {
     CPUString = "generic";
 
   ParseSubtargetFeatures(CPUString, FS);
+  initializeProperties();
+
   return *this;
 }
 
+void AArch64Subtarget::initializeProperties() {
+  // Initialize CPU specific properties. We should add a tablegen feature for
+  // this in the future so we can specify it together with the subtarget
+  // features.
+  switch (ARMProcFamily) {
+  case Cyclone:
+    CacheLineSize = 64;
+    PrefetchDistance = 280;
+    MinPrefetchStride = 2048;
+    MaxPrefetchIterationsAhead = 3;
+    break;
+  case CortexA57:
+    MaxInterleaveFactor = 4;
+    break;
+  case Kryo:
+    MaxInterleaveFactor = 4;
+    VectorInsertExtractBaseCost = 2;
+    break;
+  case Others: break;
+  case CortexA35: break;
+  case CortexA53: break;
+  case ExynosM1: break;
+  }
+}
+
 AArch64Subtarget::AArch64Subtarget(const Triple &TT, const std::string &CPU,
                                    const std::string &FS,
                                    const TargetMachine &TM, bool LittleEndian)
@@ -110,8 +137,7 @@ void AArch64Subtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
   // Enabling or Disabling the latency heuristic is a close call: It seems to
   // help nearly no benchmark on out-of-order architectures, on the other hand
   // it regresses register pressure on a few benchmarking.
-  if (isCyclone())
-    Policy.DisableLatencyHeuristic = true;
+  Policy.DisableLatencyHeuristic = DisableLatencySchedHeuristic;
 }
 
 bool AArch64Subtarget::enableEarlyIfConversion() const {
@@ -133,8 +159,5 @@ bool AArch64Subtarget::supportsAddressTopByteIgnored() const {
 
 std::unique_ptr<PBQPRAConstraint>
 AArch64Subtarget::getCustomPBQPConstraints() const {
-  if (!isCortexA57())
-    return nullptr;
-
-  return llvm::make_unique<A57ChainingConstraint>();
+  return balanceFPOps() ? llvm::make_unique<A57ChainingConstraint>() : nullptr;
 }

‎llvm/lib/Target/AArch64/AArch64Subtarget.h

(+58 −11)

@@ -33,8 +33,8 @@ class StringRef;
 class Triple;
 
 class AArch64Subtarget : public AArch64GenSubtargetInfo {
-protected:
-  enum ARMProcFamilyEnum {
+public:
+  enum ARMProcFamilyEnum : uint8_t {
     Others,
     CortexA35,
     CortexA53,
@@ -44,6 +44,7 @@ class AArch64Subtarget : public AArch64GenSubtargetInfo {
     Kryo
   };
 
+protected:
   /// ARMProcFamily - ARM processor family: Cortex-A53, Cortex-A57, and others.
   ARMProcFamilyEnum ARMProcFamily = Others;
 
@@ -66,6 +67,24 @@ class AArch64Subtarget : public AArch64GenSubtargetInfo {
 
   // StrictAlign - Disallow unaligned memory accesses.
   bool StrictAlign = false;
+  bool MergeNarrowLoads = false;
+  bool UseAA = false;
+  bool PredictableSelectIsExpensive = false;
+  bool BalanceFPOps = false;
+  bool CustomAsCheapAsMove = false;
+  bool UsePostRAScheduler = false;
+  bool Misaligned128StoreIsSlow = false;
+  bool AvoidQuadLdStPairs = false;
+  bool UseAlternateSExtLoadCVTF32Pattern = false;
+  bool HasMacroOpFusion = false;
+  bool DisableLatencySchedHeuristic = false;
+  bool UseRSqrt = false;
+  uint8_t MaxInterleaveFactor = 2;
+  uint8_t VectorInsertExtractBaseCost = 3;
+  uint16_t CacheLineSize = 0;
+  uint16_t PrefetchDistance = 0;
+  uint16_t MinPrefetchStride = 1;
+  unsigned MaxPrefetchIterationsAhead = UINT_MAX;
 
   // ReserveX18 - X18 is not available as a general purpose register.
   bool ReserveX18;
@@ -93,6 +112,9 @@ class AArch64Subtarget : public AArch64GenSubtargetInfo {
   /// subtarget initialization.
   AArch64Subtarget &initializeSubtargetDependencies(StringRef FS);
 
+  /// Initialize properties based on the selected processor family.
+  void initializeProperties();
+
 public:
   /// This constructor initializes the data members to match that
   /// of the specified triple.
@@ -123,7 +145,15 @@ class AArch64Subtarget : public AArch64GenSubtargetInfo {
   const Triple &getTargetTriple() const { return TargetTriple; }
   bool enableMachineScheduler() const override { return true; }
   bool enablePostRAScheduler() const override {
-    return isGeneric() || isCortexA53() || isCortexA57() || isKryo();
+    return UsePostRAScheduler;
+  }
+
+  /// Returns ARM processor family.
+  /// Avoid this function! CPU specifics should be kept local to this class
+  /// and preferably modeled with SubtargetFeatures or properties in
+  /// initializeProperties().
+  ARMProcFamilyEnum getProcFamily() const {
+    return ARMProcFamily;
   }
 
   bool hasV8_1aOps() const { return HasV8_1aOps; }
@@ -140,6 +170,30 @@ class AArch64Subtarget : public AArch64GenSubtargetInfo {
   bool hasNEON() const { return HasNEON; }
   bool hasCrypto() const { return HasCrypto; }
  bool hasCRC() const { return HasCRC; }
+  bool mergeNarrowLoads() const { return MergeNarrowLoads; }
+  bool balanceFPOps() const { return BalanceFPOps; }
+  bool predictableSelectIsExpensive() const {
+    return PredictableSelectIsExpensive;
+  }
+  bool hasCustomCheapAsMoveHandling() const { return CustomAsCheapAsMove; }
+  bool isMisaligned128StoreSlow() const { return Misaligned128StoreIsSlow; }
+  bool avoidQuadLdStPairs() const { return AvoidQuadLdStPairs; }
+  bool useAlternateSExtLoadCVTF32Pattern() const {
+    return UseAlternateSExtLoadCVTF32Pattern;
+  }
+  bool hasMacroOpFusion() const { return HasMacroOpFusion; }
+  bool useRSqrt() const { return UseRSqrt; }
+  unsigned getMaxInterleaveFactor() const { return MaxInterleaveFactor; }
+  unsigned getVectorInsertExtractBaseCost() const {
+    return VectorInsertExtractBaseCost;
+  }
+  unsigned getCacheLineSize() const { return CacheLineSize; }
+  unsigned getPrefetchDistance() const { return PrefetchDistance; }
+  unsigned getMinPrefetchStride() const { return MinPrefetchStride; }
+  unsigned getMaxPrefetchIterationsAhead() const {
+    return MaxPrefetchIterationsAhead;
+  }
+
   /// CPU has TBI (top byte of addresses is ignored during HW address
   /// translation) and OS enables it.
   bool supportsAddressTopByteIgnored() const;
@@ -160,14 +214,7 @@ class AArch64Subtarget : public AArch64GenSubtargetInfo {
   bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); }
   bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); }
 
-  bool isGeneric() const { return CPUString == "generic"; }
-  bool isCyclone() const { return CPUString == "cyclone"; }
-  bool isCortexA57() const { return CPUString == "cortex-a57"; }
-  bool isCortexA53() const { return CPUString == "cortex-a53"; }
-  bool isExynosM1() const { return CPUString == "exynos-m1"; }
-  bool isKryo() const { return CPUString == "kryo"; }
-
-  bool useAA() const override { return isCortexA53(); }
+  bool useAA() const override { return UseAA; }
 
   /// getMaxInlineSizeThreshold - Returns the maximum memset / memcpy size
   /// that still makes it profitable to inline the call.

‎llvm/lib/Target/AArch64/AArch64TargetMachine.cpp

(+1 −2)

@@ -147,8 +147,7 @@ static void initReciprocals(AArch64TargetMachine& TM, AArch64Subtarget& ST)
   // (52 mantissa bits) are 2 and 3, respectively.
   unsigned ExtraStepsF = 2,
            ExtraStepsD = ExtraStepsF + 1;
-  // FIXME: Enable x^-1/2 only for Exynos M1 at the moment.
-  bool UseRsqrt = ST.isExynosM1();
+  bool UseRsqrt = ST.useRSqrt();
 
   TM.Options.Reciprocals.setDefaults("sqrtf", UseRsqrt, ExtraStepsF);
   TM.Options.Reciprocals.setDefaults("sqrtd", UseRsqrt, ExtraStepsD);

‎llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

(+6 −21)

@@ -368,9 +368,7 @@ int AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
   }
 
   // All other insert/extracts cost this much.
-  if (ST->isKryo())
-    return 2;
-  return 3;
+  return ST->getVectorInsertExtractBaseCost();
 }
 
 int AArch64TTIImpl::getArithmeticInstrCost(
@@ -529,9 +527,7 @@ int AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) {
 }
 
 unsigned AArch64TTIImpl::getMaxInterleaveFactor(unsigned VF) {
-  if (ST->isCortexA57() || ST->isKryo())
-    return 4;
-  return 2;
+  return ST->getMaxInterleaveFactor();
 }
 
 void AArch64TTIImpl::getUnrollingPreferences(Loop *L,
@@ -630,28 +626,17 @@ bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
 }
 
 unsigned AArch64TTIImpl::getCacheLineSize() {
-  if (ST->isCyclone())
-    return 64;
-  return BaseT::getCacheLineSize();
+  return ST->getCacheLineSize();
 }
 
 unsigned AArch64TTIImpl::getPrefetchDistance() {
-  if (ST->isCyclone())
-    return 280;
-  return BaseT::getPrefetchDistance();
+  return ST->getPrefetchDistance();
 }
 
 unsigned AArch64TTIImpl::getMinPrefetchStride() {
-  if (ST->isCyclone())
-    // The HW prefetcher handles accesses with strides up to 2KB.
-    return 2048;
-  return BaseT::getMinPrefetchStride();
+  return ST->getMinPrefetchStride();
}
 
 unsigned AArch64TTIImpl::getMaxPrefetchIterationsAhead() {
-  if (ST->isCyclone())
-    // Be conservative for now and don't prefetch ahead too much since the loop
-    // may terminate early.
-    return 3;
-  return BaseT::getMaxPrefetchIterationsAhead();
+  return ST->getMaxPrefetchIterationsAhead();
 }
