Skip to content

Commit 75d76e5

Browse files
committed Jun 28, 2017
[ARM] Improve if-conversion for M-class CPUs without branch predictors
The current heuristic in isProfitableToIfCvt assumes we have a branch predictor, and so gives the wrong answer in some cases when we don't. This patch adds a subtarget feature to indicate that a subtarget has no branch predictor, and changes the heuristic in isProfitableToIfCvt when it's present. This gives a slight overall improvement in a set of embedded benchmarks on Cortex-M4 and Cortex-M33. Differential Revision: https://reviews.llvm.org/D34398 llvm-svn: 306547
1 parent 48b30c3 commit 75d76e5

File tree

6 files changed

+239
-14
lines changed

6 files changed

+239
-14
lines changed
 

‎llvm/lib/Target/ARM/ARM.td

+20-6
Original file line numberDiff line numberDiff line change
@@ -222,6 +222,13 @@ def FeatureAvoidMOVsShOp : SubtargetFeature<"avoid-movs-shop",
222222
def FeatureHasRetAddrStack : SubtargetFeature<"ret-addr-stack", "HasRetAddrStack", "true",
223223
"Has return address stack">;
224224

225+
// Some processors have no branch predictor, which changes the expected cost of
226+
// taking a branch which affects the choice of whether to use predicated
227+
// instructions.
228+
def FeatureHasNoBranchPredictor : SubtargetFeature<"no-branch-predictor",
229+
"HasBranchPredictor", "false",
230+
"Has no branch predictor">;
231+
225232
/// DSP extension.
226233
def FeatureDSP : SubtargetFeature<"dsp", "HasDSP", "true",
227234
"Supports DSP instructions in ARM and/or Thumb2">;
@@ -756,13 +763,19 @@ def : ProcessorModel<"cortex-r8", CortexA8Model, [ARMv7r,
756763
FeatureHasSlowFPVMLx,
757764
FeatureAvoidPartialCPSR]>;
758765

759-
def : ProcNoItin<"cortex-m3", [ARMv7m, ProcM3]>;
760-
def : ProcNoItin<"sc300", [ARMv7m, ProcM3]>;
766+
def : ProcessorModel<"cortex-m3", CortexM3Model, [ARMv7m,
767+
ProcM3,
768+
FeatureHasNoBranchPredictor]>;
769+
770+
def : ProcessorModel<"sc300", CortexM3Model, [ARMv7m,
771+
ProcM3,
772+
FeatureHasNoBranchPredictor]>;
761773

762-
def : ProcNoItin<"cortex-m4", [ARMv7em,
774+
def : ProcessorModel<"cortex-m4", CortexM3Model, [ARMv7em,
763775
FeatureVFP4,
764776
FeatureVFPOnlySP,
765-
FeatureD16]>;
777+
FeatureD16,
778+
FeatureHasNoBranchPredictor]>;
766779

767780
def : ProcNoItin<"cortex-m7", [ARMv7em,
768781
FeatureFPARMv8,
@@ -771,11 +784,12 @@ def : ProcNoItin<"cortex-m7", [ARMv7em,
771784
def : ProcNoItin<"cortex-m23", [ARMv8mBaseline,
772785
FeatureNoMovt]>;
773786

774-
def : ProcNoItin<"cortex-m33", [ARMv8mMainline,
787+
def : ProcessorModel<"cortex-m33", CortexM3Model, [ARMv8mMainline,
775788
FeatureDSP,
776789
FeatureFPARMv8,
777790
FeatureD16,
778-
FeatureVFPOnlySP]>;
791+
FeatureVFPOnlySP,
792+
FeatureHasNoBranchPredictor]>;
779793

780794
def : ProcNoItin<"cortex-a32", [ARMv8a,
781795
FeatureHWDivThumb,

‎llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp

+37-8
Original file line numberDiff line numberDiff line change
@@ -1851,9 +1851,9 @@ isProfitableToIfCvt(MachineBasicBlock &MBB,
18511851
}
18521852

18531853
bool ARMBaseInstrInfo::
1854-
isProfitableToIfCvt(MachineBasicBlock &,
1854+
isProfitableToIfCvt(MachineBasicBlock &TBB,
18551855
unsigned TCycles, unsigned TExtra,
1856-
MachineBasicBlock &,
1856+
MachineBasicBlock &FBB,
18571857
unsigned FCycles, unsigned FExtra,
18581858
BranchProbability Probability) const {
18591859
if (!TCycles)
@@ -1863,14 +1863,43 @@ isProfitableToIfCvt(MachineBasicBlock &,
18631863
// Here we scale up each component of UnpredCost to avoid precision issue when
18641864
// scaling TCycles/FCycles by Probability.
18651865
const unsigned ScalingUpFactor = 1024;
1866-
unsigned TUnpredCost = Probability.scale(TCycles * ScalingUpFactor);
1867-
unsigned FUnpredCost =
1866+
1867+
unsigned PredCost = (TCycles + FCycles + TExtra + FExtra) * ScalingUpFactor;
1868+
unsigned UnpredCost;
1869+
if (!Subtarget.hasBranchPredictor()) {
1870+
// When we don't have a branch predictor it's always cheaper to not take a
1871+
// branch than take it, so we have to take that into account.
1872+
unsigned NotTakenBranchCost = 1;
1873+
unsigned TakenBranchCost = Subtarget.getMispredictionPenalty();
1874+
unsigned TUnpredCycles, FUnpredCycles;
1875+
if (!FCycles) {
1876+
// Triangle: TBB is the fallthrough
1877+
TUnpredCycles = TCycles + NotTakenBranchCost;
1878+
FUnpredCycles = TakenBranchCost;
1879+
} else {
1880+
// Diamond: TBB is the block that is branched to, FBB is the fallthrough
1881+
TUnpredCycles = TCycles + TakenBranchCost;
1882+
FUnpredCycles = FCycles + NotTakenBranchCost;
1883+
}
1884+
// The total cost is the cost of each path scaled by their probabilities
1885+
unsigned TUnpredCost = Probability.scale(TUnpredCycles * ScalingUpFactor);
1886+
unsigned FUnpredCost = Probability.getCompl().scale(FUnpredCycles * ScalingUpFactor);
1887+
UnpredCost = TUnpredCost + FUnpredCost;
1888+
// When predicating assume that the first IT can be folded away but later
1889+
// ones cost one cycle each
1890+
if (Subtarget.isThumb2() && TCycles + FCycles > 4) {
1891+
PredCost += ((TCycles + FCycles - 4) / 4) * ScalingUpFactor;
1892+
}
1893+
} else {
1894+
unsigned TUnpredCost = Probability.scale(TCycles * ScalingUpFactor);
1895+
unsigned FUnpredCost =
18681896
Probability.getCompl().scale(FCycles * ScalingUpFactor);
1869-
unsigned UnpredCost = TUnpredCost + FUnpredCost;
1870-
UnpredCost += 1 * ScalingUpFactor; // The branch itself
1871-
UnpredCost += Subtarget.getMispredictionPenalty() * ScalingUpFactor / 10;
1897+
UnpredCost = TUnpredCost + FUnpredCost;
1898+
UnpredCost += 1 * ScalingUpFactor; // The branch itself
1899+
UnpredCost += Subtarget.getMispredictionPenalty() * ScalingUpFactor / 10;
1900+
}
18721901

1873-
return (TCycles + FCycles + TExtra + FExtra) * ScalingUpFactor <= UnpredCost;
1902+
return PredCost <= UnpredCost;
18741903
}
18751904

18761905
bool

‎llvm/lib/Target/ARM/ARMSchedule.td

+1
Original file line numberDiff line numberDiff line change
@@ -424,3 +424,4 @@ include "ARMScheduleA9.td"
424424
include "ARMScheduleSwift.td"
425425
include "ARMScheduleR52.td"
426426
include "ARMScheduleA57.td"
427+
include "ARMScheduleM3.td"

‎llvm/lib/Target/ARM/ARMScheduleM3.td

+21
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
//=- ARMScheduleM3.td - ARM Cortex-M3 Scheduling Definitions -*- tablegen -*-=//
2+
//
3+
// The LLVM Compiler Infrastructure
4+
//
5+
// This file is distributed under the University of Illinois Open Source
6+
// License. See LICENSE.TXT for details.
7+
//
8+
//===----------------------------------------------------------------------===//
9+
//
10+
// This file defines the machine model for the ARM Cortex-M3 processor.
11+
//
12+
//===----------------------------------------------------------------------===//
13+
14+
def CortexM3Model : SchedMachineModel {
15+
let IssueWidth = 1; // Only IT can be dual-issued, so assume single-issue
16+
let MicroOpBufferSize = 0; // In-order
17+
let LoadLatency = 2; // Latency when not pipelined, not pc-relative
18+
let MispredictPenalty = 2; // Best case branch taken cost
19+
20+
let CompleteModel = 0;
21+
}

‎llvm/lib/Target/ARM/ARMSubtarget.h

+6
Original file line numberDiff line numberDiff line change
@@ -246,6 +246,11 @@ class ARMSubtarget : public ARMGenSubtargetInfo {
246246
/// avoid issuing "normal" call instructions to callees which do not return.
247247
bool HasRetAddrStack = false;
248248

249+
/// HasBranchPredictor - True if the subtarget has a branch predictor. Having
250+
/// a branch predictor or not changes the expected cost of taking a branch
251+
/// which affects the choice of whether to use predicated instructions.
252+
bool HasBranchPredictor = true;
253+
249254
/// HasMPExtension - True if the subtarget supports Multiprocessing
250255
/// extension (ARMv7 only).
251256
bool HasMPExtension = false;
@@ -554,6 +559,7 @@ class ARMSubtarget : public ARMGenSubtargetInfo {
554559
bool cheapPredicableCPSRDef() const { return CheapPredicableCPSRDef; }
555560
bool avoidMOVsShifterOperand() const { return AvoidMOVsShifterOperand; }
556561
bool hasRetAddrStack() const { return HasRetAddrStack; }
562+
bool hasBranchPredictor() const { return HasBranchPredictor; }
557563
bool hasMPExtension() const { return HasMPExtension; }
558564
bool hasDSP() const { return HasDSP; }
559565
bool useNaClTrap() const { return UseNaClTrap; }
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,154 @@
1+
; RUN: llc < %s -mtriple=thumbv7m -mcpu=cortex-m7 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-BP
2+
; RUN: llc < %s -mtriple=thumbv7m -mcpu=cortex-m3 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-NOBP
3+
4+
declare void @otherfn()
5+
6+
; CHECK-LABEL: triangle1:
7+
; CHECK: itt ne
8+
; CHECK: movne
9+
; CHECK: strne
10+
define i32 @triangle1(i32 %n, i32* %p) {
11+
entry:
12+
%tobool = icmp eq i32 %n, 0
13+
br i1 %tobool, label %if.end, label %if.then
14+
15+
if.then:
16+
store i32 1, i32* %p, align 4
17+
br label %if.end
18+
19+
if.end:
20+
tail call void @otherfn()
21+
ret i32 0
22+
}
23+
24+
; CHECK-LABEL: triangle2:
25+
; CHECK-BP: itttt ne
26+
; CHECK-BP: movne
27+
; CHECK-BP: strne
28+
; CHECK-BP: movne
29+
; CHECK-BP: strne
30+
; CHECK-NOBP: cbz
31+
; CHECK-NOBP: movs
32+
; CHECK-NOBP: str
33+
; CHECK-NOBP: movs
34+
; CHECK-NOBP: str
35+
define i32 @triangle2(i32 %n, i32* %p, i32* %q) {
36+
entry:
37+
%tobool = icmp eq i32 %n, 0
38+
br i1 %tobool, label %if.end, label %if.then
39+
40+
if.then:
41+
store i32 1, i32* %p, align 4
42+
store i32 2, i32* %q, align 4
43+
br label %if.end
44+
45+
if.end:
46+
tail call void @otherfn()
47+
ret i32 0
48+
}
49+
50+
; CHECK-LABEL: triangle3:
51+
; CHECK: cbz
52+
; CHECK: movs
53+
; CHECK: str
54+
; CHECK: movs
55+
; CHECK: str
56+
; CHECK: movs
57+
; CHECK: str
58+
define i32 @triangle3(i32 %n, i32* %p, i32* %q, i32* %r) {
59+
entry:
60+
%tobool = icmp eq i32 %n, 0
61+
br i1 %tobool, label %if.end, label %if.then
62+
63+
if.then:
64+
store i32 1, i32* %p, align 4
65+
store i32 2, i32* %q, align 4
66+
store i32 3, i32* %r, align 4
67+
br label %if.end
68+
69+
if.end:
70+
tail call void @otherfn()
71+
ret i32 0
72+
}
73+
74+
; CHECK-LABEL: diamond1:
75+
; CHECK: ite eq
76+
; CHECK: ldreq
77+
; CHECK: strne
78+
define i32 @diamond1(i32 %n, i32* %p) {
79+
entry:
80+
%tobool = icmp eq i32 %n, 0
81+
br i1 %tobool, label %if.else, label %if.then
82+
83+
if.then:
84+
store i32 %n, i32* %p, align 4
85+
br label %if.end
86+
87+
if.else:
88+
%0 = load i32, i32* %p, align 4
89+
br label %if.end
90+
91+
if.end:
92+
%n.addr.0 = phi i32 [ %n, %if.then ], [ %0, %if.else ]
93+
tail call void @otherfn()
94+
ret i32 %n.addr.0
95+
}
96+
97+
; CHECK-LABEL: diamond2:
98+
; CHECK-BP: itte
99+
; CHECK-BP: streq
100+
; CHECK-BP: ldreq
101+
; CHECK-BP: strne
102+
; CHECK-NOBP: cbz
103+
; CHECK-NOBP: str
104+
; CHECK-NOBP: b
105+
; CHECK-NOBP: str
106+
; CHECK-NOBP: ldr
107+
define i32 @diamond2(i32 %n, i32 %m, i32* %p, i32* %q) {
108+
entry:
109+
%tobool = icmp eq i32 %n, 0
110+
br i1 %tobool, label %if.else, label %if.then
111+
112+
if.then:
113+
store i32 %n, i32* %p, align 4
114+
br label %if.end
115+
116+
if.else:
117+
store i32 %m, i32* %q, align 4
118+
%0 = load i32, i32* %p, align 4
119+
br label %if.end
120+
121+
if.end:
122+
%n.addr.0 = phi i32 [ %n, %if.then ], [ %0, %if.else ]
123+
tail call void @otherfn()
124+
ret i32 %n.addr.0
125+
}
126+
127+
; CHECK-LABEL: diamond3:
128+
; CHECK: cbz
129+
; CHECK: movs
130+
; CHECK: str
131+
; CHECK: b
132+
; CHECK: ldr
133+
; CHECK: ldr
134+
; CHECK: adds
135+
define i32 @diamond3(i32 %n, i32* %p, i32* %q) {
136+
entry:
137+
%tobool = icmp eq i32 %n, 0
138+
br i1 %tobool, label %if.else, label %if.then
139+
140+
if.then:
141+
store i32 1, i32* %p, align 4
142+
br label %if.end
143+
144+
if.else:
145+
%0 = load i32, i32* %p, align 4
146+
%1 = load i32, i32* %q, align 4
147+
%add = add nsw i32 %1, %0
148+
br label %if.end
149+
150+
if.end:
151+
%n.addr.0 = phi i32 [ %n, %if.then ], [ %add, %if.else ]
152+
tail call void @otherfn()
153+
ret i32 %n.addr.0
154+
}

0 commit comments

Comments
 (0)
Please sign in to comment.