
Commit 8fc2cbf

committed Dec 8, 2015
[EarlyCSE] Value forwarding for unordered atomics
This patch teaches the fully redundant load part of EarlyCSE how to forward from atomic and volatile loads and stores, and how to eliminate unordered atomics (only). It does not include dead store elimination support for unordered atomics; that will follow in the near future.

The basic idea is that we allow all loads and stores to be tracked by the AvailableLoads table. We store a bit in the table which tracks whether the load/store was atomic, and then only replace atomic loads with ones which were also atomic. No attempt is made to refine our handling of ordered loads or stores; those are still treated as full fences. We could pretty easily extend the release fence handling to release stores, but that should be a separate patch.

Differential Revision: http://reviews.llvm.org/D15337

llvm-svn: 255054
1 parent 0aea1b8 commit 8fc2cbf
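As a quick illustration of the new behaviour (a minimal sketch in the spirit of the tests added below; the function name is invented for this summary), the pass can now forward the value of an earlier atomic load into a later plain load of the same location, since the access that survives is at least as strongly ordered as the one removed:

; Mirrors @test13 in the new test file below: after -early-cse, %b is
; replaced by %a and %res simplifies to 0.
define i32 @forward_from_atomic(i32* %P1) {
  %a = load atomic i32, i32* %P1 seq_cst, align 4
  %b = load i32, i32* %P1
  %res = sub i32 %a, %b
  ret i32 %res
}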

2 files changed: +204, -19 lines

llvm/lib/Transforms/Scalar/EarlyCSE.cpp

+77 -19
@@ -281,21 +281,31 @@ class EarlyCSE {
   /// that dominated values can succeed in their lookup.
   ScopedHTType AvailableValues;
 
-  /// \brief A scoped hash table of the current values of loads.
+  /// A scoped hash table of the current values of previously encountered
+  /// memory locations.
   ///
-  /// This allows us to get efficient access to dominating loads when we have
-  /// a fully redundant load. In addition to the most recent load, we keep
-  /// track of a generation count of the read, which is compared against the
-  /// current generation count. The current generation count is incremented
+  /// This allows us to get efficient access to dominating loads or stores when
+  /// we have a fully redundant load. In addition to the most recent load, we
+  /// keep track of a generation count of the read, which is compared against
+  /// the current generation count. The current generation count is incremented
   /// after every possibly writing memory operation, which ensures that we only
-  /// CSE loads with other loads that have no intervening store.
+  /// CSE loads with other loads that have no intervening store. Ordering
+  /// events (such as fences or atomic instructions) increment the generation
+  /// count as well; essentially, we model these as writes to all possible
+  /// locations. Note that atomic and/or volatile loads and stores can be
+  /// present in the table; it is the responsibility of the consumer to inspect
+  /// the atomicity/volatility if needed.
   struct LoadValue {
     Value *Data;
     unsigned Generation;
     int MatchingId;
-    LoadValue() : Data(nullptr), Generation(0), MatchingId(-1) {}
-    LoadValue(Value *Data, unsigned Generation, unsigned MatchingId)
-        : Data(Data), Generation(Generation), MatchingId(MatchingId) {}
+    bool IsAtomic;
+    LoadValue()
+        : Data(nullptr), Generation(0), MatchingId(-1), IsAtomic(false) {}
+    LoadValue(Value *Data, unsigned Generation, unsigned MatchingId,
+              bool IsAtomic)
+        : Data(Data), Generation(Generation), MatchingId(MatchingId),
+          IsAtomic(IsAtomic) {}
   };
   typedef RecyclingAllocator<BumpPtrAllocator,
                              ScopedHashTableVal<Value *, LoadValue>>
@@ -410,6 +420,42 @@ class EarlyCSE {
       }
       return Inst->isAtomic();
     }
+    bool isAtomic() const {
+      if (IsTargetMemInst) {
+        assert(Info.IsSimple && "need to refine IsSimple in TTI");
+        return false;
+      }
+      return Inst->isAtomic();
+    }
+    bool isUnordered() const {
+      if (IsTargetMemInst) {
+        assert(Info.IsSimple && "need to refine IsSimple in TTI");
+        return true;
+      }
+      if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
+        return LI->isUnordered();
+      } else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+        return SI->isUnordered();
+      }
+      // Conservative answer
+      return !Inst->isAtomic();
+    }
+
+    bool isVolatile() const {
+      if (IsTargetMemInst) {
+        assert(Info.IsSimple && "need to refine IsSimple in TTI");
+        return false;
+      }
+      if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
+        return LI->isVolatile();
+      } else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+        return SI->isVolatile();
+      }
+      // Conservative answer
+      return true;
+    }
+
+
     bool isMatchingMemLoc(const ParseMemoryInst &Inst) const {
       return (getPointerOperand() == Inst.getPointerOperand() &&
               getMatchingId() == Inst.getMatchingId());
@@ -561,20 +607,22 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
     ParseMemoryInst MemInst(Inst, TTI);
     // If this is a non-volatile load, process it.
     if (MemInst.isValid() && MemInst.isLoad()) {
-      // Ignore volatile or ordered loads.
-      if (!MemInst.isSimple()) {
+      // (conservatively) we can't peek past the ordering implied by this
+      // operation, but we can add this load to our set of available values
+      if (MemInst.isVolatile() || !MemInst.isUnordered()) {
         LastStore = nullptr;
-        // Don't CSE across synchronization boundaries.
-        if (Inst->mayWriteToMemory())
-          ++CurrentGeneration;
-        continue;
+        ++CurrentGeneration;
       }
 
       // If we have an available version of this load, and if it is the right
       // generation, replace this instruction.
       LoadValue InVal = AvailableLoads.lookup(MemInst.getPointerOperand());
       if (InVal.Data != nullptr && InVal.Generation == CurrentGeneration &&
-          InVal.MatchingId == MemInst.getMatchingId()) {
+          InVal.MatchingId == MemInst.getMatchingId() &&
+          // We don't yet handle removing loads with ordering of any kind.
+          !MemInst.isVolatile() && MemInst.isUnordered() &&
+          // We can't replace an atomic load with one which isn't also atomic.
+          InVal.IsAtomic >= MemInst.isAtomic()) {
         Value *Op = getOrCreateResult(InVal.Data, Inst->getType());
         if (Op != nullptr) {
           DEBUG(dbgs() << "EarlyCSE CSE LOAD: " << *Inst
@@ -591,7 +639,8 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
       // Otherwise, remember that we have this instruction.
       AvailableLoads.insert(
           MemInst.getPointerOperand(),
-          LoadValue(Inst, CurrentGeneration, MemInst.getMatchingId()));
+          LoadValue(Inst, CurrentGeneration, MemInst.getMatchingId(),
+                    MemInst.isAtomic()));
       LastStore = nullptr;
       continue;
     }
@@ -646,9 +695,12 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
 
       if (MemInst.isValid() && MemInst.isStore()) {
         // We do a trivial form of DSE if there are two stores to the same
-        // location with no intervening loads. Delete the earlier store.
+        // location with no intervening loads. Delete the earlier store. Note
+        // that we can delete an earlier simple store even if the following one
+        // is an ordered/volatile/atomic store.
         if (LastStore) {
           ParseMemoryInst LastStoreMemInst(LastStore, TTI);
+          assert(LastStoreMemInst.isSimple() && "Violated invariant");
           if (LastStoreMemInst.isMatchingMemLoc(MemInst)) {
             DEBUG(dbgs() << "EarlyCSE DEAD STORE: " << *LastStore
                          << " due to: " << *Inst << '\n');
@@ -667,11 +719,17 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
         // the store.
         AvailableLoads.insert(
             MemInst.getPointerOperand(),
-            LoadValue(Inst, CurrentGeneration, MemInst.getMatchingId()));
+            LoadValue(Inst, CurrentGeneration, MemInst.getMatchingId(),
+                      MemInst.isAtomic()));
 
         // Remember that this was the last normal store we saw for DSE.
+        // Note that we can't delete an earlier atomic or volatile store in
+        // favor of a later one which isn't. We could in principle remove an
+        // earlier unordered store if the later one is also unordered.
         if (MemInst.isSimple())
           LastStore = Inst;
+        else
+          LastStore = nullptr;
       }
     }
   }
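To spell out the rule encoded by the new "InVal.IsAtomic >= MemInst.isAtomic()" check in the load hunk above: treating the two flags as 0/1, an available value may only replace a load that is no more atomic than the access the value came from. A sketch (function names invented for this note; the new tests below exercise these cases and the volatile ones):

; Allowed (cf. @test14 below): the available value comes from a seq_cst load,
; and the load being removed is only unordered.
define i32 @forward_to_weaker(i32* %P1) {
  %a = load atomic i32, i32* %P1 seq_cst, align 4
  %b = load atomic i32, i32* %P1 unordered, align 4
  %res = sub i32 %a, %b
  ret i32 %res
}

; Rejected (cf. @test16 below): a non-atomic value may not stand in for an
; atomic load, so both loads are kept.
define i32 @no_forward_to_atomic(i32* %P1) {
  %a = load i32, i32* %P1, align 4
  %b = load atomic i32, i32* %P1 unordered, align 4
  %res = sub i32 %a, %b
  ret i32 %res
}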
New test file

+127 -0
@@ -0,0 +1,127 @@
+; RUN: opt < %s -S -early-cse | FileCheck %s
+
+; CHECK-LABEL: @test12(
+define i32 @test12(i1 %B, i32* %P1, i32* %P2) {
+  %load0 = load i32, i32* %P1
+  %1 = load atomic i32, i32* %P2 seq_cst, align 4
+  %load1 = load i32, i32* %P1
+  %sel = select i1 %B, i32 %load0, i32 %load1
+  ret i32 %sel
+  ; CHECK: load i32, i32* %P1
+  ; CHECK: load i32, i32* %P1
+}
+
+; CHECK-LABEL: @test13(
+; atomic to non-atomic forwarding is legal
+define i32 @test13(i1 %B, i32* %P1) {
+  %a = load atomic i32, i32* %P1 seq_cst, align 4
+  %b = load i32, i32* %P1
+  %res = sub i32 %a, %b
+  ret i32 %res
+  ; CHECK: load atomic i32, i32* %P1
+  ; CHECK: ret i32 0
+}
+
+; CHECK-LABEL: @test14(
+; atomic to unordered atomic forwarding is legal
+define i32 @test14(i1 %B, i32* %P1) {
+  %a = load atomic i32, i32* %P1 seq_cst, align 4
+  %b = load atomic i32, i32* %P1 unordered, align 4
+  %res = sub i32 %a, %b
+  ret i32 %res
+  ; CHECK: load atomic i32, i32* %P1
+  ; CHECK: ret i32 0
+}
+
+; CHECK-LABEL: @test15(
+; implementation restriction: can't forward to stronger
+; than unordered
+define i32 @test15(i1 %B, i32* %P1, i32* %P2) {
+  %a = load atomic i32, i32* %P1 seq_cst, align 4
+  %b = load atomic i32, i32* %P1 seq_cst, align 4
+  %res = sub i32 %a, %b
+  ret i32 %res
+  ; CHECK: load atomic i32, i32* %P1
+  ; CHECK: load atomic i32, i32* %P1
+}
+
+; CHECK-LABEL: @test16(
+; forwarding non-atomic to atomic is wrong! (However,
+; it would be legal to use the later value in place of the
+; former in this particular example. We just don't
+; do that right now.)
+define i32 @test16(i1 %B, i32* %P1, i32* %P2) {
+  %a = load i32, i32* %P1, align 4
+  %b = load atomic i32, i32* %P1 unordered, align 4
+  %res = sub i32 %a, %b
+  ret i32 %res
+  ; CHECK: load i32, i32* %P1
+  ; CHECK: load atomic i32, i32* %P1
+}
+
+; Can't DSE across a full fence
+define void @test17(i1 %B, i32* %P1, i32* %P2) {
+; CHECK-LABEL: @test17
+; CHECK: store
+; CHECK: store atomic
+; CHECK: store
+  store i32 0, i32* %P1, align 4
+  store atomic i32 0, i32* %P2 seq_cst, align 4
+  store i32 0, i32* %P1, align 4
+  ret void
+}
+
+; Can't remove a volatile load
+define i32 @test18(i1 %B, i32* %P1, i32* %P2) {
+  %a = load i32, i32* %P1, align 4
+  %b = load volatile i32, i32* %P1, align 4
+  %res = sub i32 %a, %b
+  ret i32 %res
+  ; CHECK-LABEL: @test18
+  ; CHECK: load i32, i32* %P1
+  ; CHECK: load volatile i32, i32* %P1
+}
+
+; Can't DSE a volatile store
+define void @test19(i1 %B, i32* %P1, i32* %P2) {
+; CHECK-LABEL: @test19
+; CHECK: store volatile
+; CHECK: store
+  store volatile i32 0, i32* %P1, align 4
+  store i32 3, i32* %P1, align 4
+  ret void
+}
+
+; Can value forward from volatiles
+define i32 @test20(i1 %B, i32* %P1, i32* %P2) {
+  %a = load volatile i32, i32* %P1, align 4
+  %b = load i32, i32* %P1, align 4
+  %res = sub i32 %a, %b
+  ret i32 %res
+  ; CHECK-LABEL: @test20
+  ; CHECK: load volatile i32, i32* %P1
+  ; CHECK: ret i32 0
+}
+
+; We could DSE a non-volatile store in favor of a volatile one;
+; currently a missed optimization
+define void @test21(i1 %B, i32* %P1, i32* %P2) {
+; CHECK-LABEL: @test21
+; CHECK: store
+; CHECK: store volatile
+  store i32 0, i32* %P1, align 4
+  store volatile i32 3, i32* %P1, align 4
+  ret void
+}
+
+; Can DSE a normal store in favor of an unordered one
+define void @test22(i1 %B, i32* %P1, i32* %P2) {
+; CHECK-LABEL: @test22
+; CHECK-NEXT: store atomic
+  store i32 0, i32* %P1, align 4
+  store atomic i32 3, i32* %P1 unordered, align 4
+  ret void
+}
+