Skip to content

Commit 60a1e4d

Browse files
committedAug 14, 2018
[LV] Teach about non header phis that have uses outside the loop
Summary: This patch teaches the loop vectorizer to vectorize loops with non header phis that have have outside uses. This is because the iteration dependence distance for these phis can be widened upto VF (similar to how we do for induction/reduction) if they do not have a cyclic dependence with header phis. When identifying reduction/induction/first order recurrence header phis, we already identify if there are any cyclic dependencies that prevents vectorization. The vectorizer is taught to extract the last element from the vectorized phi and update the scalar loop exit block phi to contain this extracted element from the vector loop. This patch can be extended to vectorize loops where instructions other than phis have outside uses. Reviewers: Ayal, mkuper, mssimpso, efriedma Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D50579 llvm-svn: 339703
1 parent 6513f37 commit 60a1e4d

File tree

3 files changed

+247
-18
lines changed

3 files changed

+247
-18
lines changed
 

‎llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -434,8 +434,10 @@ static Type *getWiderType(const DataLayout &DL, Type *Ty0, Type *Ty1) {
434434
/// identified reduction variable.
435435
static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst,
436436
SmallPtrSetImpl<Value *> &AllowedExit) {
437-
// Reduction and Induction instructions are allowed to have exit users. All
437+
// Reductions, Inductions and non-header phis are allowed to have exit users. All
438438
// other instructions must not have external users.
439+
// TODO: Non-phi instructions can also be taught to have exit users, now that
440+
// we know how to extract the last scalar element from the loop.
439441
if (!AllowedExit.count(Inst))
440442
// Check that all of the users of the loop are inside the BB.
441443
for (User *U : Inst->users()) {
@@ -597,14 +599,12 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
597599
// can convert it to select during if-conversion. No need to check if
598600
// the PHIs in this block are induction or reduction variables.
599601
if (BB != Header) {
600-
// Check that this instruction has no outside users or is an
601-
// identified reduction value with an outside user.
602-
if (!hasOutsideLoopUser(TheLoop, Phi, AllowedExit))
603-
continue;
604-
ORE->emit(createMissedAnalysis("NeitherInductionNorReduction", Phi)
605-
<< "value could not be identified as "
606-
"an induction or reduction variable");
607-
return false;
602+
// Non-header phi nodes that have outside uses can be vectorized. Add
603+
// them to the list of allowed exits.
604+
// Unsafe cyclic dependencies with header phis are identified during
605+
// legalization for reduction, induction and first order
606+
// recurrences.
607+
continue;
608608
}
609609

610610
// We only allow if-converted PHIs with exactly two incoming values.

‎llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3720,9 +3720,13 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
37203720
void InnerLoopVectorizer::fixLCSSAPHIs() {
37213721
for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
37223722
if (LCSSAPhi.getNumIncomingValues() == 1) {
3723-
assert(OrigLoop->isLoopInvariant(LCSSAPhi.getIncomingValue(0)) &&
3724-
"Incoming value isn't loop invariant");
3725-
LCSSAPhi.addIncoming(LCSSAPhi.getIncomingValue(0), LoopMiddleBlock);
3723+
auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
3724+
// Can be a loop invariant incoming value or the last scalar value to be
3725+
// extracted from the vectorized loop.
3726+
Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3727+
Value *lastIncomingValue =
3728+
getOrCreateScalarValue(IncomingValue, {UF - 1, VF - 1});
3729+
LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
37263730
}
37273731
}
37283732
}

‎llvm/test/Transforms/LoopVectorize/no_outside_user.ll

Lines changed: 231 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,5 @@
11
; RUN: opt -S -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 < %s 2>&1 | FileCheck %s
22

3-
; CHECK: remark: {{.*}}: loop not vectorized: value could not be identified as an induction or reduction variable
4-
53
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32-S128"
64

75
@f = common global i32 0, align 4
@@ -11,13 +9,22 @@ target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f3
119
@b = common global i32 0, align 4
1210
@e = common global i32 0, align 4
1311

14-
; We used to vectorize this loop. But it has a value that is used outside of the
12+
; It has a value that is used outside of the loop
1513
; and is not a recognized reduction variable "tmp17".
14+
; However, tmp17 is a non-header phi which is an allowed exit.
1615

17-
; CHECK-LABEL: @main(
18-
; CHECK-NOT: <2 x i32>
16+
; CHECK-LABEL: @test1(
17+
; CHECK: %vec.ind = phi <2 x i32>
18+
; CHECK: [[CMP:%[a-zA-Z0-9.]+]] = icmp sgt <2 x i32> %vec.ind, <i32 10, i32 10>
19+
; CHECK: %predphi = select <2 x i1> [[CMP]], <2 x i32> <i32 1, i32 1>, <2 x i32> zeroinitializer
20+
21+
; CHECK-LABEL: middle.block:
22+
; CHECK: [[E1:%[a-zA-Z0-9.]+]] = extractelement <2 x i32> %predphi, i32 1
23+
24+
; CHECK-LABEL: f1.exit.loopexit:
25+
; CHECK: %.lcssa = phi i32 [ %tmp17, %bb16 ], [ [[E1]], %middle.block ]
1926

20-
define i32 @main() {
27+
define i32 @test1() {
2128
bb:
2229
%b.promoted = load i32, i32* @b, align 4
2330
br label %.lr.ph.i
@@ -40,3 +47,221 @@ f1.exit.loopexit:
4047
%.lcssa = phi i32 [ %tmp17, %bb16 ]
4148
ret i32 %.lcssa
4249
}
50+
51+
; non-hdr phi depends on header phi.
52+
; CHECK-LABEL: @test2(
53+
; CHECK: %vec.ind = phi <2 x i32>
54+
; CHECK: [[CMP:%[a-zA-Z0-9.]+]] = icmp sgt <2 x i32> %vec.ind, <i32 10, i32 10>
55+
; CHECK: %predphi = select <2 x i1> [[CMP]], <2 x i32> <i32 1, i32 1>, <2 x i32> %vec.ind
56+
57+
; CHECK-LABEL: middle.block:
58+
; CHECK: [[E1:%[a-zA-Z0-9.]+]] = extractelement <2 x i32> %predphi, i32 1
59+
60+
; CHECK-LABEL: f1.exit.loopexit:
61+
; CHECK: %.lcssa = phi i32 [ %tmp17, %bb16 ], [ [[E1]], %middle.block ]
62+
define i32 @test2() {
63+
bb:
64+
%b.promoted = load i32, i32* @b, align 4
65+
br label %.lr.ph.i
66+
67+
.lr.ph.i:
68+
%tmp8 = phi i32 [ %tmp18, %bb16 ], [ %b.promoted, %bb ]
69+
%tmp2 = icmp sgt i32 %tmp8, 10
70+
br i1 %tmp2, label %bb16, label %bb10
71+
72+
bb10:
73+
br label %bb16
74+
75+
bb16:
76+
%tmp17 = phi i32 [ %tmp8, %bb10 ], [ 1, %.lr.ph.i ]
77+
%tmp18 = add nsw i32 %tmp8, 1
78+
%tmp19 = icmp slt i32 %tmp18, 4
79+
br i1 %tmp19, label %.lr.ph.i, label %f1.exit.loopexit
80+
81+
f1.exit.loopexit:
82+
%.lcssa = phi i32 [ %tmp17, %bb16 ]
83+
ret i32 %.lcssa
84+
}
85+
86+
; more than 2 incoming values for tmp17 phi that is used outside loop.
87+
; CHECK-LABEL: test3(
88+
; CHECK-LABEL: vector.body:
89+
; CHECK: %predphi = select <2 x i1> %{{.*}}, <2 x i32> <i32 1, i32 1>, <2 x i32> zeroinitializer
90+
; CHECK: %predphi1 = select <2 x i1> %{{.*}}, <2 x i32> <i32 2, i32 2>, <2 x i32> %predphi
91+
92+
; CHECK-LABEL: middle.block:
93+
; CHECK: [[E1:%[a-zA-Z0-9.]+]] = extractelement <2 x i32> %predphi1, i32 1
94+
95+
; CHECK-LABEL: f1.exit.loopexit:
96+
; CHECK: phi i32 [ %tmp17, %bb16 ], [ [[E1]], %middle.block ]
97+
define i32 @test3(i32 %N) {
98+
bb:
99+
%b.promoted = load i32, i32* @b, align 4
100+
br label %.lr.ph.i
101+
102+
.lr.ph.i:
103+
%tmp8 = phi i32 [ %tmp18, %bb16 ], [ %b.promoted, %bb ]
104+
%tmp2 = icmp sgt i32 %tmp8, 10
105+
br i1 %tmp2, label %bb16, label %bb10
106+
107+
bb10:
108+
%cmp = icmp sgt i32 %tmp8, %N
109+
br i1 %cmp, label %bb12, label %bb16
110+
111+
bb12:
112+
br label %bb16
113+
114+
bb16:
115+
%tmp17 = phi i32 [ 0, %bb10 ], [ 1, %.lr.ph.i ], [ 2, %bb12 ]
116+
%tmp18 = add nsw i32 %tmp8, 1
117+
%tmp19 = icmp slt i32 %tmp18, 4
118+
br i1 %tmp19, label %.lr.ph.i, label %f1.exit.loopexit
119+
120+
f1.exit.loopexit:
121+
%.lcssa = phi i32 [ %tmp17, %bb16 ]
122+
ret i32 %.lcssa
123+
}
124+
125+
; more than one incoming value for outside user: %.lcssa
126+
; CHECK-LABEL: test4(
127+
; CHECK-LABEL: vector.body:
128+
; CHECK: %predphi = select <2 x i1>
129+
130+
; CHECK-LABEL: middle.block:
131+
; CHECK: [[E1:%[a-zA-Z0-9.]+]] = extractelement <2 x i32> %predphi, i32 1
132+
133+
; CHECK-LABEL: f1.exit.loopexit.loopexit:
134+
; CHECK: %tmp17.lcssa = phi i32 [ %tmp17, %bb16 ], [ [[E1]], %middle.block ]
135+
; CHECK-NEXT: br label %f1.exit.loopexit
136+
137+
; CHECK-LABEL: f1.exit.loopexit:
138+
; CHECK: %.lcssa = phi i32 [ 2, %bb ], [ %tmp17.lcssa, %f1.exit.loopexit.loopexit ]
139+
define i32 @test4(i32 %N) {
140+
bb:
141+
%b.promoted = load i32, i32* @b, align 4
142+
%icmp = icmp slt i32 %b.promoted, %N
143+
br i1 %icmp, label %f1.exit.loopexit, label %.lr.ph.i
144+
145+
.lr.ph.i:
146+
%tmp8 = phi i32 [ %tmp18, %bb16 ], [ %b.promoted, %bb ]
147+
%tmp2 = icmp sgt i32 %tmp8, 10
148+
br i1 %tmp2, label %bb16, label %bb10
149+
150+
bb10:
151+
br label %bb16
152+
153+
bb16:
154+
%tmp17 = phi i32 [ 0, %bb10 ], [ 1, %.lr.ph.i ]
155+
%tmp18 = add nsw i32 %tmp8, 1
156+
%tmp19 = icmp slt i32 %tmp18, 4
157+
br i1 %tmp19, label %.lr.ph.i, label %f1.exit.loopexit
158+
159+
f1.exit.loopexit:
160+
%.lcssa = phi i32 [ %tmp17, %bb16 ], [ 2, %bb ]
161+
ret i32 %.lcssa
162+
}
163+
164+
; non hdr phi that depends on reduction and is used outside the loop.
165+
; reduction phis are only allowed to have bump or reduction operations as the inside user, so we should
166+
; not vectorize this.
167+
; CHECK-LABEL: reduction_sum(
168+
; CHECK-NOT: <2 x i32>
169+
define i32 @reduction_sum(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp {
170+
entry:
171+
%c1 = icmp sgt i32 %n, 0
172+
br i1 %c1, label %header, label %._crit_edge
173+
174+
header: ; preds = %0, %.lr.ph
175+
%indvars.iv = phi i64 [ %indvars.iv.next, %bb16 ], [ 0, %entry ]
176+
%sum.02 = phi i32 [ %c9, %bb16 ], [ 0, %entry ]
177+
%c2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
178+
%c3 = load i32, i32* %c2, align 4
179+
%c4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
180+
%c5 = load i32, i32* %c4, align 4
181+
%tmp2 = icmp sgt i32 %sum.02, 10
182+
br i1 %tmp2, label %bb16, label %bb10
183+
184+
bb10:
185+
br label %bb16
186+
187+
bb16:
188+
%tmp17 = phi i32 [ %sum.02, %bb10 ], [ 1, %header ]
189+
%c6 = trunc i64 %indvars.iv to i32
190+
%c7 = add i32 %sum.02, %c6
191+
%c8 = add i32 %c7, %c3
192+
%c9 = add i32 %c8, %c5
193+
%indvars.iv.next = add i64 %indvars.iv, 1
194+
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
195+
%exitcond = icmp eq i32 %lftr.wideiv, %n
196+
br i1 %exitcond, label %._crit_edge, label %header
197+
198+
._crit_edge: ; preds = %.lr.ph, %0
199+
%sum.0.lcssa = phi i32 [ 0, %entry ], [ %c9, %bb16 ]
200+
%nonhdr.lcssa = phi i32 [ 1, %entry], [ %tmp17, %bb16 ]
201+
ret i32 %sum.0.lcssa
202+
}
203+
204+
; invalid cyclic dependency with header phi iv, which prevents iv from being
205+
; recognized as induction var.
206+
; cannot vectorize.
207+
; CHECK-LABEL: cyclic_dep_with_indvar(
208+
; CHECK-NOT: <2 x i32>
209+
define i32 @cyclic_dep_with_indvar() {
210+
bb:
211+
%b.promoted = load i32, i32* @b, align 4
212+
br label %.lr.ph.i
213+
214+
.lr.ph.i:
215+
%iv = phi i32 [ %ivnext, %bb16 ], [ %b.promoted, %bb ]
216+
%tmp2 = icmp sgt i32 %iv, 10
217+
br i1 %tmp2, label %bb16, label %bb10
218+
219+
bb10:
220+
br label %bb16
221+
222+
bb16:
223+
%tmp17 = phi i32 [ 0, %bb10 ], [ %iv, %.lr.ph.i ]
224+
%ivnext = add nsw i32 %tmp17, 1
225+
%tmp19 = icmp slt i32 %ivnext, 4
226+
br i1 %tmp19, label %.lr.ph.i, label %f1.exit.loopexit
227+
228+
f1.exit.loopexit:
229+
%.lcssa = phi i32 [ %tmp17, %bb16 ]
230+
ret i32 %.lcssa
231+
}
232+
233+
; non-reduction phi 'tmp17' used outside loop has cyclic dependence with %x.05 phi
234+
; cannot vectorize.
235+
; CHECK-LABEL: not_valid_reduction(
236+
; CHECK-NOT: <2 x i32>
237+
define i32 @not_valid_reduction(i32 %n, i32* noalias nocapture %A) nounwind uwtable readonly {
238+
entry:
239+
%cmp4 = icmp sgt i32 %n, 0
240+
br i1 %cmp4, label %for.body, label %for.end
241+
242+
for.body: ; preds = %entry, %for.body
243+
%indvars.iv = phi i64 [ %indvars.iv.next, %latch ], [ 0, %entry ]
244+
%x.05 = phi i32 [ %tmp17, %latch ], [ 0, %entry ]
245+
%arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
246+
%tmp0 = load i32, i32* %arrayidx, align 4
247+
%tmp2 = icmp sgt i64 %indvars.iv, 10
248+
%sub = sub nsw i32 %x.05, %tmp0
249+
br i1 %tmp2, label %bb16, label %bb10
250+
251+
bb10:
252+
br label %bb16
253+
254+
bb16:
255+
%tmp17 = phi i32 [ 1, %bb10 ], [ %sub, %for.body ]
256+
br label %latch
257+
258+
latch:
259+
%indvars.iv.next = add i64 %indvars.iv, 1
260+
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
261+
%exitcond = icmp eq i32 %lftr.wideiv, %n
262+
br i1 %exitcond, label %for.end, label %for.body
263+
264+
for.end: ; preds = %for.body, %entry
265+
%x.0.lcssa = phi i32 [ 0, %entry ], [ %tmp17 , %latch ]
266+
ret i32 %x.0.lcssa
267+
}

0 commit comments

Comments
 (0)