Skip to content

Commit cd2be7f

Browse files
author
Chad Rosier
committedFeb 12, 2016
[AArch64] Add support for Qualcomm Kryo CPU.
Machine model description by Dave Estes <cestes@codeaurora.org>. llvm-svn: 260686
1 parent 852860e commit cd2be7f

11 files changed

+2509
-5
lines changed
 

‎llvm/lib/Target/AArch64/AArch64.td

+10
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,7 @@ include "AArch64SchedA53.td"
9191
include "AArch64SchedA57.td"
9292
include "AArch64SchedCyclone.td"
9393
include "AArch64SchedM1.td"
94+
include "AArch64SchedKryo.td"
9495

9596
def ProcA35 : SubtargetFeature<"a35", "ARMProcFamily", "CortexA35",
9697
"Cortex-A35 ARM processors",
@@ -133,6 +134,14 @@ def ProcExynosM1 : SubtargetFeature<"exynosm1", "ARMProcFamily", "ExynosM1",
133134
FeatureCRC,
134135
FeaturePerfMon]>;
135136

137+
def ProcKryo : SubtargetFeature<"kryo", "ARMProcFamily", "Kryo",
138+
"Qualcomm Kryo processors",
139+
[FeatureFPARMv8,
140+
FeatureNEON,
141+
FeatureCrypto,
142+
FeatureCRC,
143+
FeaturePerfMon]>;
144+
136145
def : ProcessorModel<"generic", NoSchedModel, [FeatureFPARMv8,
137146
FeatureNEON,
138147
FeatureCRC,
@@ -146,6 +155,7 @@ def : ProcessorModel<"cortex-a57", CortexA57Model, [ProcA57]>;
146155
def : ProcessorModel<"cortex-a72", CortexA57Model, [ProcA57]>;
147156
def : ProcessorModel<"cyclone", CycloneModel, [ProcCyclone]>;
148157
def : ProcessorModel<"exynos-m1", ExynosM1Model, [ProcExynosM1]>;
158+
def : ProcessorModel<"kryo", KryoModel, [ProcKryo]>;
149159

150160
//===----------------------------------------------------------------------===//
151161
// Assembly parser

‎llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -637,7 +637,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
637637
}
638638

639639
// Prefer likely predicted branches to selects on out-of-order cores.
640-
if (Subtarget->isCortexA57())
640+
if (Subtarget->isCortexA57() || Subtarget->isKryo())
641641
PredictableSelectIsExpensive = true;
642642
}
643643

‎llvm/lib/Target/AArch64/AArch64InstrInfo.cpp

+2-1
Original file line numberDiff line numberDiff line change
@@ -543,7 +543,8 @@ static bool canBeExpandedToORR(const MachineInstr *MI, unsigned BitSize) {
543543
// FIXME: this implementation should be micro-architecture dependent, so a
544544
// micro-architecture target hook should be introduced here in future.
545545
bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr *MI) const {
546-
if (!Subtarget.isCortexA57() && !Subtarget.isCortexA53())
546+
if (!Subtarget.isCortexA57() && !Subtarget.isCortexA53() &&
547+
!Subtarget.isKryo())
547548
return MI->isAsCheapAsAMove();
548549

549550
switch (MI->getOpcode()) {

‎llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -1969,7 +1969,7 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
19691969
}
19701970

19711971
bool AArch64LoadStoreOpt::enableNarrowLdMerge(MachineFunction &Fn) {
1972-
bool ProfitableArch = Subtarget->isCortexA57();
1972+
bool ProfitableArch = Subtarget->isCortexA57() || Subtarget->isKryo();
19731973
// FIXME: The benefit from converting narrow loads into a wider load could be
19741974
// microarchitectural as it assumes that a single load with two bitfield
19751975
// extracts is cheaper than two narrow loads. Currently, this conversion is
+130
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,130 @@
1+
//==- AArch64SchedKryo.td - Qualcomm Kryo Scheduling Defs ---*- tablegen -*-==//
2+
//
3+
// The LLVM Compiler Infrastructure
4+
//
5+
// This file is distributed under the University of Illinois Open Source
6+
// License. See LICENSE.TXT for details.
7+
//
8+
//===----------------------------------------------------------------------===//
9+
//
10+
// This file defines the machine model for Qualcomm Kryo to support
11+
// instruction scheduling and other instruction cost heuristics.
12+
//
13+
//===----------------------------------------------------------------------===//
14+
15+
//===----------------------------------------------------------------------===//
16+
// The issue width is set to five, matching the five issue queues for expanded
17+
// uops. Now, the latency spreadsheet has information based on fragmented uops,
18+
// but these do not actually take up an issue queue.
19+
20+
def KryoModel : SchedMachineModel {
21+
let IssueWidth = 5; // 5-wide issue for expanded uops
22+
let MicroOpBufferSize = 128; // Out-of-order with temporary unified issue buffer
23+
let LoadLatency = 4; // Optimistic load latency
24+
let MispredictPenalty = 14; // Fetch + Decode/Rename/Dispatch + Branch
25+
26+
// Enable partial & runtime unrolling. The magic number is chosen based on
27+
// experiments and benchmarking data.
28+
let LoopMicroOpBufferSize = 16;
29+
}
30+
31+
//===----------------------------------------------------------------------===//
32+
// Define each kind of processor resource and number available on Kryo.
33+
34+
let SchedModel = KryoModel in {
35+
def KryoUnitXA : ProcResource<1>; // Type X(A) micro-ops
36+
def KryoUnitXB : ProcResource<1>; // Type X(B) micro-ops
37+
def KryoUnitYA : ProcResource<1>; // Type Y(A) micro-ops
38+
def KryoUnitYB : ProcResource<1>; // Type Y(B) micro-ops
39+
def KryoUnitX : ProcResGroup<[KryoUnitXA, // Type X micro-ops
40+
KryoUnitXB]>;
41+
def KryoUnitY : ProcResGroup<[KryoUnitYA, // Type Y micro-ops
42+
KryoUnitYB]>;
43+
def KryoUnitXY : ProcResGroup<[KryoUnitXA, // Type XY micro-ops
44+
KryoUnitXB,
45+
KryoUnitYA,
46+
KryoUnitYB]>;
47+
def KryoUnitLSA : ProcResource<1>; // Type LS(A) micro-ops
48+
def KryoUnitLSB : ProcResource<1>; // Type LS(B) micro-ops
49+
def KryoUnitLS : ProcResGroup<[KryoUnitLSA, // Type LS micro-ops
50+
KryoUnitLSB]>;
51+
}
52+
53+
let SchedModel = KryoModel in {
54+
55+
//===----------------------------------------------------------------------===//
56+
// Map the target-defined scheduler read/write resources and latency for
57+
// Kryo.
58+
59+
def : WriteRes<WriteImm, [KryoUnitXY]> { let Latency = 1; }
60+
def : WriteRes<WriteI, [KryoUnitXY]> { let Latency = 1; }
61+
def : WriteRes<WriteISReg, [KryoUnitXY, KryoUnitXY]>
62+
{ let Latency = 2; let NumMicroOps = 2; }
63+
def : WriteRes<WriteIEReg, [KryoUnitXY, KryoUnitXY]>
64+
{ let Latency = 2; let NumMicroOps = 2; }
65+
def : WriteRes<WriteExtr, [KryoUnitXY, KryoUnitX]>
66+
{ let Latency = 2; let NumMicroOps = 2; }
67+
def : WriteRes<WriteIS, [KryoUnitXY]> { let Latency = 2; }
68+
def : WriteRes<WriteID32, [KryoUnitXA, KryoUnitY]>
69+
{ let Latency = 8; let NumMicroOps = 1; } // Fragment -1
70+
def : WriteRes<WriteID64, [KryoUnitXA, KryoUnitY]>
71+
{ let Latency = 8; let NumMicroOps = 1; } // Fragment -1
72+
def : WriteRes<WriteIM32, [KryoUnitX]> { let Latency = 5; }
73+
def : WriteRes<WriteIM64, [KryoUnitX]> { let Latency = 5; }
74+
def : WriteRes<WriteBr, [KryoUnitXY]> { let Latency = 1; }
75+
def : WriteRes<WriteBrReg, [KryoUnitXY]> { let Latency = 1; }
76+
def : WriteRes<WriteLD, [KryoUnitLS]> { let Latency = 4; }
77+
def : WriteRes<WriteST, [KryoUnitLS]> { let Latency = 4; }
78+
def : WriteRes<WriteSTP, [KryoUnitLS]> { let Latency = 4; }
79+
def : WriteRes<WriteAdr, [KryoUnitXY]> { let Latency = 6; }
80+
def : WriteRes<WriteLDIdx, [KryoUnitLS]> { let Latency = 4; }
81+
def : WriteRes<WriteSTIdx, [KryoUnitLS]> { let Latency = 4; }
82+
def : WriteRes<WriteF, [KryoUnitXY, KryoUnitXY]>
83+
{ let Latency = 3; let NumMicroOps = 2; }
84+
def : WriteRes<WriteFCmp, [KryoUnitXY]> { let Latency = 2; }
85+
def : WriteRes<WriteFCvt, [KryoUnitX]> { let Latency = 4; }
86+
def : WriteRes<WriteFCopy, [KryoUnitXY]> { let Latency = 6; }
87+
def : WriteRes<WriteFImm, [KryoUnitXY]> { let Latency = 6; }
88+
def : WriteRes<WriteFMul, [KryoUnitX, KryoUnitX]>
89+
{ let Latency = 6; let NumMicroOps = 2; }
90+
def : WriteRes<WriteFDiv, [KryoUnitXA, KryoUnitY]>
91+
{ let Latency = 12; let NumMicroOps = 2; } // Fragment -1 / NoRSV +1
92+
def : WriteRes<WriteV, [KryoUnitXY]> { let Latency = 6; }
93+
def : WriteRes<WriteVLD, [KryoUnitLS]> { let Latency = 4; }
94+
def : WriteRes<WriteVST, [KryoUnitLS]> { let Latency = 4; }
95+
96+
def : WriteRes<WriteSys, []> { let Latency = 1; }
97+
def : WriteRes<WriteBarrier, []> { let Latency = 1; }
98+
def : WriteRes<WriteHint, []> { let Latency = 1; }
99+
100+
def : WriteRes<WriteLDHi, []> { let Latency = 4; }
101+
102+
// No forwarding logic is modelled yet.
103+
def : ReadAdvance<ReadI, 0>;
104+
def : ReadAdvance<ReadISReg, 0>;
105+
def : ReadAdvance<ReadIEReg, 0>;
106+
def : ReadAdvance<ReadIM, 0>;
107+
def : ReadAdvance<ReadIMA, 0>;
108+
def : ReadAdvance<ReadID, 0>;
109+
def : ReadAdvance<ReadExtrHi, 0>;
110+
def : ReadAdvance<ReadAdrBase, 0>;
111+
def : ReadAdvance<ReadVLD, 0>;
112+
113+
114+
//===----------------------------------------------------------------------===//
115+
// Specialize the coarse model by associating instruction groups with the
116+
// subtarget-defined types. As the model is refined, this will override most
117+
// of the above SchedWriteRes and SchedAlias mappings.
118+
119+
// Miscellaneous
120+
// -----------------------------------------------------------------------------
121+
122+
def : InstRW<[WriteI], (instrs COPY)>;
123+
124+
125+
// Detailed Refinements
126+
// -----------------------------------------------------------------------------
127+
include "AArch64SchedKryoDetails.td"
128+
129+
130+
} // SchedModel = KryoModel

‎llvm/lib/Target/AArch64/AArch64SchedKryoDetails.td

+2,358
Large diffs are not rendered by default.

‎llvm/lib/Target/AArch64/AArch64Subtarget.h

+3-1
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,8 @@ class AArch64Subtarget : public AArch64GenSubtargetInfo {
3939
CortexA53,
4040
CortexA57,
4141
Cyclone,
42-
ExynosM1
42+
ExynosM1,
43+
Kryo
4344
};
4445

4546
/// ARMProcFamily - ARM processor family: Cortex-A53, Cortex-A57, and others.
@@ -151,6 +152,7 @@ class AArch64Subtarget : public AArch64GenSubtargetInfo {
151152
bool isCortexA57() const { return CPUString == "cortex-a57"; }
152153
bool isCortexA53() const { return CPUString == "cortex-a53"; }
153154
bool isExynosM1() const { return CPUString == "exynos-m1"; }
155+
bool isKryo() const { return CPUString == "kryo"; }
154156

155157
bool useAA() const override { return isCortexA53(); }
156158

‎llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -472,7 +472,7 @@ int AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) {
472472
}
473473

474474
unsigned AArch64TTIImpl::getMaxInterleaveFactor(unsigned VF) {
475-
if (ST->isCortexA57())
475+
if (ST->isCortexA57() || ST->isKryo())
476476
return 4;
477477
return 2;
478478
}

‎llvm/test/CodeGen/AArch64/arm64-narrow-ldst-merge.ll

+1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
; RUN: llc < %s -mtriple aarch64--none-eabi -mcpu=cortex-a57 -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=LE
22
; RUN: llc < %s -mtriple aarch64_be--none-eabi -mcpu=cortex-a57 -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=BE
3+
; RUN: llc < %s -mtriple aarch64--none-eabi -mcpu=kryo -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=LE
34

45
; CHECK-LABEL: Ldrh_merge
56
; CHECK-NOT: ldrh

‎llvm/test/CodeGen/AArch64/cpus.ll

+1
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=cortex-a57 2>&1 | FileCheck %s
88
; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=cortex-a72 2>&1 | FileCheck %s
99
; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=exynos-m1 2>&1 | FileCheck %s
10+
; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=kryo 2>&1 | FileCheck %s
1011
; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=invalidcpu 2>&1 | FileCheck %s --check-prefix=INVALID
1112

1213
; CHECK-NOT: {{.*}} is not a recognized processor for this target

‎llvm/test/CodeGen/AArch64/remat.ll

+1
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=cortex-a53 -o - %s | FileCheck %s
44
; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=cortex-a72 -o - %s | FileCheck %s
55
; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=exynos-m1 -o - %s | FileCheck %s
6+
; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=kryo -o - %s | FileCheck %s
67

78
%X = type { i64, i64, i64 }
89
declare void @f(%X*)

0 commit comments

Comments
 (0)
Please sign in to comment.