
Commit 7f4b26e

Committed Feb 2, 2017
[LICM] Hoist loads that are dominated by the invariant.start intrinsic and are invariant in the loop.
Summary: We can hoist loads that are dominated by an invariant.start out to the preheader. We conservatively assume the load is variant if we see a corresponding use of the invariant.start (it could be an invariant.end or an escaping call).

Reviewers: mkuper, sanjoy, reames

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D29331

llvm-svn: 293887
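For illustration, here is a minimal IR sketch of the pattern this patch targets (hypothetical code, not part of the commit; @sketch and @clobber are made-up names, and the committed tests in hoisting.ll below cover the real cases):

declare {}* @llvm.invariant.start.p0i8(i64, i8* nocapture) nounwind readonly
declare void @clobber()   ; stands in for a call LICM cannot otherwise see through

define i32 @sketch(i8* %p, i32 %n) {
entry:
  ; Mark 4 bytes at %p as invariant; the returned marker has no uses.
  %inv = call {}* @llvm.invariant.start.p0i8(i64 4, i8* %p)
  %p.i32 = bitcast i8* %p to i32*
  br label %loop

loop:
  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
  %sum = phi i32 [ 0, %entry ], [ %sum.next, %loop ]
  ; The unknown call below would normally block hoisting; with this patch,
  ; LICM may hoist this load into %entry (the preheader), because the
  ; dominating, use-free invariant.start guarantees the location is unchanged.
  %v = load i32, i32* %p.i32
  call void @clobber()
  %sum.next = add i32 %sum, %v
  %iv.next = add i32 %iv, 1
  %cond = icmp slt i32 %iv.next, %n
  br i1 %cond, label %loop, label %exit

exit:
  ret i32 %sum
}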

2 files changed: +233, -0 lines

llvm/lib/Transforms/Scalar/LICM.cpp (+62)
@@ -81,6 +81,11 @@ static cl::opt<bool>
     DisablePromotion("disable-licm-promotion", cl::Hidden,
                      cl::desc("Disable memory promotion in LICM pass"));
 
+static cl::opt<uint32_t> MaxNumUsesTraversed(
+    "licm-max-num-uses-traversed", cl::Hidden, cl::init(8),
+    cl::desc("Max num uses visited for identifying load "
+             "invariance in loop using invariant start (default = 8)"));
+
 static bool inSubLoop(BasicBlock *BB, Loop *CurLoop, LoopInfo *LI);
 static bool isNotUsedInLoop(const Instruction &I, const Loop *CurLoop,
                             const LoopSafetyInfo *SafetyInfo);
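The new cl::opt bounds how many bitcasts and uses the invariance check added below will walk, so pathological inputs with very many users of the load address do not slow down compilation. As a usage sketch (a hypothetical lit RUN line, not one added by this commit), the cap can be raised when invoking opt:

  ; RUN: opt < %s -licm -licm-max-num-uses-traversed=16 -S | FileCheck %s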
@@ -480,6 +485,59 @@ void llvm::computeLoopSafetyInfo(LoopSafetyInfo *SafetyInfo, Loop *CurLoop) {
   SafetyInfo->BlockColors = colorEHFunclets(*Fn);
 }
 
+// Return true if LI is invariant within the scope of the loop. LI is invariant
+// if CurLoop is dominated by an invariant.start representing the same memory
+// location and size as the memory location LI loads from, and also the
+// invariant.start has no uses.
+static bool isLoadInvariantInLoop(LoadInst *LI, DominatorTree *DT,
+                                  Loop *CurLoop) {
+  Value *Addr = LI->getOperand(0);
+  const DataLayout &DL = LI->getModule()->getDataLayout();
+  const uint32_t LocSizeInBits = DL.getTypeSizeInBits(
+      cast<PointerType>(Addr->getType())->getElementType());
+
+  // If the type is i8 addrspace(x)*, we know this is the type of the
+  // llvm.invariant.start operand.
+  auto *PtrInt8Ty = PointerType::get(Type::getInt8Ty(LI->getContext()),
+                                     LI->getPointerAddressSpace());
+  unsigned BitcastsVisited = 0;
+  // Look through bitcasts until we reach the i8* type (this is the
+  // invariant.start operand type).
+  while (Addr->getType() != PtrInt8Ty) {
+    auto *BC = dyn_cast<BitCastInst>(Addr);
+    // Avoid traversing a high number of bitcast uses.
+    if (++BitcastsVisited > MaxNumUsesTraversed || !BC)
+      return false;
+    Addr = BC->getOperand(0);
+  }
+
+  unsigned UsesVisited = 0;
+  // Traverse all uses of the load operand value, to see if invariant.start is
+  // one of the uses, and whether it dominates the load instruction.
+  for (auto *U : Addr->users()) {
+    // Avoid traversing for a load operand with a high number of users.
+    if (++UsesVisited > MaxNumUsesTraversed)
+      return false;
+    IntrinsicInst *II = dyn_cast<IntrinsicInst>(U);
+    // If there are escaping uses of the invariant.start instruction, the load
+    // may be non-invariant.
+    if (!II || II->getIntrinsicID() != Intrinsic::invariant_start ||
+        II->hasNUsesOrMore(1))
+      continue;
+    unsigned InvariantSizeInBits =
+        cast<ConstantInt>(II->getArgOperand(0))->getSExtValue() * 8;
+    // Confirm the invariant.start location size contains the load operand size
+    // in bits. Also, the invariant.start should dominate the load, and we
+    // should not hoist the load out of a loop that contains this dominating
+    // invariant.start.
+    if (LocSizeInBits <= InvariantSizeInBits &&
+        DT->properlyDominates(II->getParent(), CurLoop->getHeader()))
+      return true;
+  }
+
+  return false;
+}
+
 bool llvm::canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT,
                               Loop *CurLoop, AliasSetTracker *CurAST,
                               LoopSafetyInfo *SafetyInfo,
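Note that the size comparison in isLoadInvariantInLoop allows the invariant.start region to cover more bytes than the load reads. A hypothetical fragment (not from the commit) that would pass that check, assuming the invariant.start also dominates the loop header:

  ; 8 bytes starting at %p are marked invariant ...
  %inv = call {}* @llvm.invariant.start.p0i8(i64 8, i8* %p)
  %p.i32 = bitcast i8* %p to i32*
  ; ... so a later 4-byte load through the bitcast qualifies:
  ; LocSizeInBits (32) <= InvariantSizeInBits (64).
  %v = load i32, i32* %p.i32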
@@ -496,6 +554,10 @@ bool llvm::canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT,
     if (LI->getMetadata(LLVMContext::MD_invariant_load))
       return true;
 
+    // This checks for an invariant.start dominating the load.
+    if (isLoadInvariantInLoop(LI, DT, CurLoop))
+      return true;
+
     // Don't hoist loads which have may-aliased stores in loop.
     uint64_t Size = 0;
     if (LI->getType()->isSized())

llvm/test/Transforms/LICM/hoisting.ll (+171)
@@ -149,3 +149,174 @@ latch:
 return:
   ret i32 %sum
 }
+
+declare {}* @llvm.invariant.start.p0i8(i64, i8* nocapture) nounwind readonly
+declare void @llvm.invariant.end.p0i8({}*, i64, i8* nocapture) nounwind
+declare void @escaping.invariant.start({}*) nounwind
+
+; invariant.start dominates the load, and in this scope, the load is
+; invariant. So we can hoist the `addrld` load out of the loop.
+define i32 @test_fence(i8* %addr, i32 %n, i8* %volatile) {
+; CHECK-LABEL: @test_fence
+; CHECK-LABEL: entry
+; CHECK: invariant.start
+; CHECK: %addrld = load atomic i32, i32* %addr.i unordered, align 8
+; CHECK: br label %loop
+entry:
+  %gep = getelementptr inbounds i8, i8* %addr, i64 8
+  %addr.i = bitcast i8* %gep to i32*
+  store atomic i32 5, i32* %addr.i unordered, align 8
+  fence release
+  %invst = call {}* @llvm.invariant.start.p0i8(i64 4, i8* %gep)
+  br label %loop
+
+loop:
+  %indvar = phi i32 [ %indvar.next, %loop ], [ 0, %entry ]
+  %sum = phi i32 [ %sum.next, %loop ], [ 0, %entry ]
+  %volload = load atomic i8, i8* %volatile unordered, align 8
+  fence acquire
+  %volchk = icmp eq i8 %volload, 0
+  %addrld = load atomic i32, i32* %addr.i unordered, align 8
+  %sel = select i1 %volchk, i32 0, i32 %addrld
+  %sum.next = add i32 %sel, %sum
+  %indvar.next = add i32 %indvar, 1
+  %cond = icmp slt i32 %indvar.next, %n
+  br i1 %cond, label %loop, label %loopexit
+
+loopexit:
+  ret i32 %sum
+}
+
+; Same as the test above, but the load is no longer invariant (presence of
+; invariant.end). We cannot hoist the addrld out of the loop.
+define i32 @test_fence1(i8* %addr, i32 %n, i8* %volatile) {
+; CHECK-LABEL: @test_fence1
+; CHECK-LABEL: entry
+; CHECK: invariant.start
+; CHECK-NEXT: invariant.end
+; CHECK-NEXT: br label %loop
+entry:
+  %gep = getelementptr inbounds i8, i8* %addr, i64 8
+  %addr.i = bitcast i8* %gep to i32*
+  store atomic i32 5, i32* %addr.i unordered, align 8
+  fence release
+  %invst = call {}* @llvm.invariant.start.p0i8(i64 4, i8* %gep)
+  call void @llvm.invariant.end.p0i8({}* %invst, i64 4, i8* %gep)
+  br label %loop
+
+loop:
+  %indvar = phi i32 [ %indvar.next, %loop ], [ 0, %entry ]
+  %sum = phi i32 [ %sum.next, %loop ], [ 0, %entry ]
+  %volload = load atomic i8, i8* %volatile unordered, align 8
+  fence acquire
+  %volchk = icmp eq i8 %volload, 0
+  %addrld = load atomic i32, i32* %addr.i unordered, align 8
+  %sel = select i1 %volchk, i32 0, i32 %addrld
+  %sum.next = add i32 %sel, %sum
+  %indvar.next = add i32 %indvar, 1
+  %cond = icmp slt i32 %indvar.next, %n
+  br i1 %cond, label %loop, label %loopexit
+
+loopexit:
+  ret i32 %sum
+}
+
+; Same as the test above, but instead of invariant.end, we have the result of
+; invariant.start escaping through a call. We cannot hoist the load.
+define i32 @test_fence2(i8* %addr, i32 %n, i8* %volatile) {
+; CHECK-LABEL: @test_fence2
+; CHECK-LABEL: entry
+; CHECK-NOT: load
+; CHECK: br label %loop
+entry:
+  %gep = getelementptr inbounds i8, i8* %addr, i64 8
+  %addr.i = bitcast i8* %gep to i32*
+  store atomic i32 5, i32* %addr.i unordered, align 8
+  fence release
+  %invst = call {}* @llvm.invariant.start.p0i8(i64 4, i8* %gep)
+  call void @escaping.invariant.start({}* %invst)
+  br label %loop
+
+loop:
+  %indvar = phi i32 [ %indvar.next, %loop ], [ 0, %entry ]
+  %sum = phi i32 [ %sum.next, %loop ], [ 0, %entry ]
+  %volload = load atomic i8, i8* %volatile unordered, align 8
+  fence acquire
+  %volchk = icmp eq i8 %volload, 0
+  %addrld = load atomic i32, i32* %addr.i unordered, align 8
+  %sel = select i1 %volchk, i32 0, i32 %addrld
+  %sum.next = add i32 %sel, %sum
+  %indvar.next = add i32 %indvar, 1
+  %cond = icmp slt i32 %indvar.next, %n
+  br i1 %cond, label %loop, label %loopexit
+
+loopexit:
+  ret i32 %sum
+}
+
+; FIXME: invariant.start dominates the load, and in this scope, the load is
+; invariant, so we could hoist the `addrld` load out of the loop. Here the
+; load operand addr.i is bitcast before being passed to invariant.start.
+define i32 @test_fence3(i32* %addr, i32 %n, i8* %volatile) {
+; CHECK-LABEL: @test_fence3
+; CHECK-LABEL: entry
+; CHECK: invariant.start
+; CHECK-NOT: %addrld = load atomic i32, i32* %addr.i unordered, align 8
+; CHECK: br label %loop
+entry:
+  %addr.i = getelementptr inbounds i32, i32* %addr, i64 8
+  %gep = bitcast i32* %addr.i to i8*
+  store atomic i32 5, i32* %addr.i unordered, align 8
+  fence release
+  %invst = call {}* @llvm.invariant.start.p0i8(i64 4, i8* %gep)
+  br label %loop
+
+loop:
+  %indvar = phi i32 [ %indvar.next, %loop ], [ 0, %entry ]
+  %sum = phi i32 [ %sum.next, %loop ], [ 0, %entry ]
+  %volload = load atomic i8, i8* %volatile unordered, align 8
+  fence acquire
+  %volchk = icmp eq i8 %volload, 0
+  %addrld = load atomic i32, i32* %addr.i unordered, align 8
+  %sel = select i1 %volchk, i32 0, i32 %addrld
+  %sum.next = add i32 %sel, %sum
+  %indvar.next = add i32 %indvar, 1
+  %cond = icmp slt i32 %indvar.next, %n
+  br i1 %cond, label %loop, label %loopexit
+
+loopexit:
+  ret i32 %sum
+}
+
+; We should not hoist the addrld out of the loop: the invariant.start is
+; inside the loop itself, so it does not properly dominate the loop header.
+define i32 @test_fence4(i32* %addr, i32 %n, i8* %volatile) {
+; CHECK-LABEL: @test_fence4
+; CHECK-LABEL: entry
+; CHECK-NOT: %addrld = load atomic i32, i32* %addr.i unordered, align 8
+; CHECK: br label %loop
+entry:
+  %addr.i = getelementptr inbounds i32, i32* %addr, i64 8
+  %gep = bitcast i32* %addr.i to i8*
+  br label %loop
+
+loop:
+  %indvar = phi i32 [ %indvar.next, %loop ], [ 0, %entry ]
+  %sum = phi i32 [ %sum.next, %loop ], [ 0, %entry ]
+  store atomic i32 5, i32* %addr.i unordered, align 8
+  fence release
+  %invst = call {}* @llvm.invariant.start.p0i8(i64 4, i8* %gep)
+  %volload = load atomic i8, i8* %volatile unordered, align 8
+  fence acquire
+  %volchk = icmp eq i8 %volload, 0
+  %addrld = load atomic i32, i32* %addr.i unordered, align 8
+  %sel = select i1 %volchk, i32 0, i32 %addrld
+  %sum.next = add i32 %sel, %sum
+  %indvar.next = add i32 %indvar, 1
+  %cond = icmp slt i32 %indvar.next, %n
+  br i1 %cond, label %loop, label %loopexit
+
+loopexit:
+  ret i32 %sum
+}
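To exercise the new tests locally (a sketch; exact paths depend on your checkout and build layout, and the file's existing RUN line is not shown in this hunk), one would typically drive the file through lit from the build directory:

  ./bin/llvm-lit -v ../llvm/test/Transforms/LICM/hoisting.ll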
