Index: lib/Analysis/MemoryDependenceAnalysis.cpp
===================================================================
--- lib/Analysis/MemoryDependenceAnalysis.cpp
+++ lib/Analysis/MemoryDependenceAnalysis.cpp
@@ -370,6 +370,36 @@
   int64_t MemLocOffset = 0;
   unsigned Limit = BlockScanLimit;
   bool isInvariantLoad = false;
+
+  // We must be careful with atomic accesses, as they may allow another thread
+  //   to touch this location, clobbering it. We are conservative: if the
+  //   QueryInst is not a simple (non-atomic) memory access, we automatically
+  //   return getClobber.
+  // If it is simple, we know based on the results of
+  // "Compiler testing via a theory of sound optimisations in the C11/C++11
+  //   memory model" in PLDI 2013, that a non-atomic location can only be
+  //   clobbered between a pair of a release and an acquire action, with no
+  //   access to the location in between.
+  // Here is an example giving the general intuition behind this rule.
+  // In the following code:
+  //   store x 0;
+  //   release action; [1]
+  //   acquire action; [4]
+  //   %val = load x;
+  // It is unsafe to replace %val by 0 because another thread may be running:
+  //   acquire action; [2]
+  //   store x 42;
+  //   release action; [3]
+  // with synchronization from 1 to 2 and from 3 to 4, resulting in %val
+  //   being 42. A key property of this program, however, is that if either
+  //   1 or 4 were missing, there would be a race between the store of 42 and
+  //   either the store of 0 or the load (making the whole program racy).
+  // The paper mentioned above shows that the same property is respected by
+  //   every program that can detect an optimisation of that kind: either it
+  //   is racy (undefined) or there is a release followed by an acquire
+  //   between the pair of accesses under consideration.
+  bool HasSeenAcquire = false;
+
   if (isLoad && QueryInst) {
     LoadInst *LI = dyn_cast<LoadInst>(QueryInst);
     if (LI && LI->getMetadata(LLVMContext::MD_invariant_load) != nullptr)
@@ -412,19 +442,21 @@
     // be accessing the location.
     if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
       // Atomic loads have complications involved.
-      // A monotonic load is OK if the query inst is itself not atomic.
+      // A Monotonic (or higher) load is OK if the query inst is itself not atomic.
+      // An Acquire (or higher) load sets the HasSeenAcquire flag, so that any
+      //   release store will know to return getClobber.
      // FIXME: This is overly conservative.
       if (!LI->isUnordered()) {
         if (!QueryInst)
           return MemDepResult::getClobber(LI);
-        if (LI->getOrdering() != Monotonic)
-          return MemDepResult::getClobber(LI);
         if (auto *QueryLI = dyn_cast<LoadInst>(QueryInst))
           if (!QueryLI->isSimple())
             return MemDepResult::getClobber(LI);
         if (auto *QuerySI = dyn_cast<StoreInst>(QueryInst))
           if (!QuerySI->isSimple())
             return MemDepResult::getClobber(LI);
+        if (isAtLeastAcquire(LI->getOrdering()))
+          HasSeenAcquire = true;
       }
 
       // FIXME: this is overly conservative.
@@ -490,19 +522,21 @@
 
     if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
       // Atomic stores have complications involved.
-      // A monotonic store is OK if the query inst is itself not atomic.
+      // A Monotonic store is OK if the query inst is itself not atomic.
+      // A Release (or higher) store further requires that no acquire load
+      //   has been seen.
       // FIXME: This is overly conservative.
       if (!SI->isUnordered()) {
         if (!QueryInst)
           return MemDepResult::getClobber(SI);
-        if (SI->getOrdering() != Monotonic)
-          return MemDepResult::getClobber(SI);
         if (auto *QueryLI = dyn_cast<LoadInst>(QueryInst))
           if (!QueryLI->isSimple())
             return MemDepResult::getClobber(SI);
         if (auto *QuerySI = dyn_cast<StoreInst>(QueryInst))
           if (!QuerySI->isSimple())
             return MemDepResult::getClobber(SI);
+        if (HasSeenAcquire && isAtLeastRelease(SI->getOrdering()))
+          return MemDepResult::getClobber(SI);
       }
 
       // FIXME: this is overly conservative.
Index: test/Transforms/DeadStoreElimination/atomic.ll
===================================================================
--- test/Transforms/DeadStoreElimination/atomic.ll
+++ test/Transforms/DeadStoreElimination/atomic.ll
@@ -5,7 +5,7 @@
 
 ; Sanity tests for atomic stores.
 ; Note that it turns out essentially every transformation DSE does is legal on
-; atomic ops, just some transformations are not allowed across them.
+; atomic ops, just some transformations are not allowed across release-acquire pairs.
 
 @x = common global i32 0, align 4
 @y = common global i32 0, align 4
@@ -13,35 +13,32 @@
 declare void @randomop(i32*)
 
 ; DSE across unordered store (allowed)
-define void @test1() nounwind uwtable ssp {
-; CHECK: test1
+define void @test1() {
+; CHECK-LABEL: test1
 ; CHECK-NOT: store i32 0
 ; CHECK: store i32 1
-entry:
   store i32 0, i32* @x
   store atomic i32 0, i32* @y unordered, align 4
   store i32 1, i32* @x
   ret void
 }
 
-; DSE across seq_cst load (allowed in theory; not implemented ATM)
-define i32 @test2() nounwind uwtable ssp {
-; CHECK: test2
-; CHECK: store i32 0
+; DSE across seq_cst load (allowed)
+define i32 @test2() {
+; CHECK-LABEL: test2
+; CHECK-NOT: store i32 0
 ; CHECK: store i32 1
-entry:
   store i32 0, i32* @x
   %x = load atomic i32* @y seq_cst, align 4
   store i32 1, i32* @x
   ret i32 %x
 }
 
-; DSE across seq_cst store (store before atomic store must not be removed)
-define void @test3() nounwind uwtable ssp {
-; CHECK: test3
-; CHECK: store i32
+; DSE across seq_cst store (allowed)
+define void @test3() {
+; CHECK-LABEL: test3
+; CHECK-NOT: store i32 0
 ; CHECK: store atomic i32 2
-entry:
   store i32 0, i32* @x
   store atomic i32 2, i32* @y seq_cst, align 4
   store i32 1, i32* @x
@@ -49,32 +46,29 @@
 }
 
 ; DSE remove unordered store (allowed)
-define void @test4() nounwind uwtable ssp {
-; CHECK: test4
+define void @test4() {
+; CHECK-LABEL: test4
 ; CHECK-NOT: store atomic
 ; CHECK: store i32 1
-entry:
   store atomic i32 0, i32* @x unordered, align 4
   store i32 1, i32* @x
   ret void
 }
 
 ; DSE unordered store overwriting non-atomic store (allowed)
-define void @test5() nounwind uwtable ssp {
-; CHECK: test5
+define void @test5() {
+; CHECK-LABEL: test5
 ; CHECK: store atomic i32 1
-entry:
   store i32 0, i32* @x
   store atomic i32 1, i32* @x unordered, align 4
   ret void
 }
 
 ; DSE no-op unordered atomic store (allowed)
-define void @test6() nounwind uwtable ssp {
-; CHECK: test6
+define void @test6() {
+; CHECK-LABEL: test6
 ; CHECK-NOT: store
 ; CHECK: ret void
-entry:
   %x = load atomic i32* @x unordered, align 4
   store atomic i32 %x, i32* @x unordered, align 4
   ret void
@@ -82,10 +76,9 @@
 
 ; DSE seq_cst store (be conservative; DSE doesn't have infrastructure
 ; to reason about atomic operations).
-define void @test7() nounwind uwtable ssp {
-; CHECK: test7
-; CHECK: store atomic
-entry:
+define void @test7() {
+; CHECK-LABEL: test7
+; CHECK: store atomic
   %a = alloca i32
   store atomic i32 0, i32* %a seq_cst, align 4
   ret void
@@ -93,11 +86,10 @@
 
 ; DSE and seq_cst load (be conservative; DSE doesn't have infrastructure
 ; to reason about atomic operations).
-define i32 @test8() nounwind uwtable ssp {
-; CHECK: test8
+define i32 @test8() {
+; CHECK-LABEL: test8
 ; CHECK: store
-; CHECK: load atomic
-entry:
+; CHECK: load atomic
   %a = alloca i32
   call void @randomop(i32* %a)
   store i32 0, i32* %a, align 4
@@ -106,11 +98,10 @@
 }
 
 ; DSE across monotonic load (allowed as long as the eliminated store isUnordered)
-define i32 @test9() nounwind uwtable ssp {
-; CHECK: test9
+define i32 @test9() {
+; CHECK-LABEL: test9
 ; CHECK-NOT: store i32 0
 ; CHECK: store i32 1
-entry:
   store i32 0, i32* @x
   %x = load atomic i32* @y monotonic, align 4
   store i32 1, i32* @x
@@ -118,11 +109,10 @@
 }
 
 ; DSE across monotonic store (allowed as long as the eliminated store isUnordered)
-define void @test10() nounwind uwtable ssp {
-; CHECK: test10
+define void @test10() {
+; CHECK-LABEL: test10
 ; CHECK-NOT: store i32 0
 ; CHECK: store i32 1
-entry:
   store i32 0, i32* @x
   store atomic i32 42, i32* @y monotonic, align 4
   store i32 1, i32* @x
@@ -130,11 +120,10 @@
 }
 
 ; DSE across monotonic load (forbidden since the eliminated store is atomic)
-define i32 @test11() nounwind uwtable ssp {
-; CHECK: test11
+define i32 @test11() {
+; CHECK-LABEL: test11
 ; CHECK: store atomic i32 0
 ; CHECK: store atomic i32 1
-entry:
   store atomic i32 0, i32* @x monotonic, align 4
   %x = load atomic i32* @y monotonic, align 4
   store atomic i32 1, i32* @x monotonic, align 4
@@ -142,13 +131,48 @@
 }
 
 ; DSE across monotonic store (forbidden since the eliminated store is atomic)
-define void @test12() nounwind uwtable ssp {
-; CHECK: test12
+define void @test12() {
+; CHECK-LABEL: test12
 ; CHECK: store atomic i32 0
 ; CHECK: store atomic i32 1
-entry:
   store atomic i32 0, i32* @x monotonic, align 4
   store atomic i32 42, i32* @y monotonic, align 4
   store atomic i32 1, i32* @x monotonic, align 4
   ret void
 }
+
+; DSE is allowed across a pair of an atomic read and then write.
+define i32 @test13() {
+; CHECK-LABEL: test13
+; CHECK-NOT: store i32 0
+; CHECK: store i32 1
+  store i32 0, i32* @x
+  %x = load atomic i32* @y seq_cst, align 4
+  store atomic i32 %x, i32* @y seq_cst, align 4
+  store i32 1, i32* @x
+  ret i32 %x
+}
+
+; Same if it is acquire-release instead of seq_cst/seq_cst
+define i32 @test14() {
+; CHECK-LABEL: test14
+; CHECK-NOT: store i32 0
+; CHECK: store i32 1
+  store i32 0, i32* @x
+  %x = load atomic i32* @y acquire, align 4
+  store atomic i32 %x, i32* @y release, align 4
+  store i32 1, i32* @x
+  ret i32 %x
+}
+
+; But DSE is not allowed across a release-acquire pair.
+define i32 @test15() {
+; CHECK-LABEL: test15
+; CHECK: store i32 0
+; CHECK: store i32 1
+  store i32 0, i32* @x
+  store atomic i32 0, i32* @y release, align 4
+  %x = load atomic i32* @y acquire, align 4
+  store i32 1, i32* @x
+  ret i32 %x
+}
Index: test/Transforms/GVN/atomic.ll
===================================================================
--- test/Transforms/GVN/atomic.ll
+++ test/Transforms/GVN/atomic.ll
@@ -8,7 +8,7 @@
 
 ; GVN across unordered store (allowed)
 define i32 @test1() nounwind uwtable ssp {
-; CHECK: test1
+; CHECK-LABEL: test1
 ; CHECK: add i32 %x, %x
 entry:
   %x = load i32* @y
@@ -18,10 +18,10 @@
   ret i32 %z
 }
 
-; GVN across seq_cst store (allowed in theory; not implemented ATM)
+; GVN across seq_cst store (allowed)
 define i32 @test2() nounwind uwtable ssp {
-; CHECK: test2
-; CHECK: add i32 %x, %y
+; CHECK-LABEL: test2
+; CHECK: add i32 %x, %x
 entry:
   %x = load i32* @y
   store atomic i32 %x, i32* @x seq_cst, align 4
@@ -32,7 +32,7 @@
 
 ; GVN across unordered load (allowed)
 define i32 @test3() nounwind uwtable ssp {
-; CHECK: test3
+; CHECK-LABEL: test3
 ; CHECK: add i32 %x, %x
 entry:
   %x = load i32* @y
@@ -43,11 +43,11 @@
   ret i32 %b
 }
 
-; GVN across acquire load (load after atomic load must not be removed)
+; GVN across acquire load (allowed as the original load was not atomic)
 define i32 @test4() nounwind uwtable ssp {
-; CHECK: test4
+; CHECK-LABEL: test4
 ; CHECK: load atomic i32* @x
-; CHECK: load i32* @y
+; CHECK-NOT: load i32* @y
 entry:
   %x = load i32* @y
   %y = load atomic i32* @x seq_cst, align 4
@@ -59,7 +59,7 @@
 
 ; GVN load to unordered load (allowed)
 define i32 @test5() nounwind uwtable ssp {
-; CHECK: test5
+; CHECK-LABEL: test5
 ; CHECK: add i32 %x, %x
 entry:
   %x = load atomic i32* @x unordered, align 4
@@ -70,7 +70,7 @@
 
 ; GVN unordered load to load (unordered load must not be removed)
 define i32 @test6() nounwind uwtable ssp {
-; CHECK: test6
+; CHECK-LABEL: test6
 ; CHECK: load atomic i32* @x unordered
 entry:
   %x = load i32* @x
@@ -79,9 +79,35 @@
   ret i32 %x3
 }
 
-; GVN across monotonic store (allowed)
+; GVN across release-acquire pair (forbidden)
 define i32 @test7() nounwind uwtable ssp {
-; CHECK: test7
+; CHECK-LABEL: test7
+; CHECK: add i32 %x, %y
+entry:
+  %x = load i32* @y
+  store atomic i32 %x, i32* @x release, align 4
+  %w = load atomic i32* @x acquire, align 4
+  %y = load i32* @y
+  %z = add i32 %x, %y
+  ret i32 %z
+}
+
+; GVN across acquire-release pair (allowed)
+define i32 @test8() nounwind uwtable ssp {
+; CHECK-LABEL: test8
+; CHECK: add i32 %x, %x
+entry:
+  %x = load i32* @y
+  %w = load atomic i32* @x acquire, align 4
+  store atomic i32 %x, i32* @x release, align 4
+  %y = load i32* @y
+  %z = add i32 %x, %y
+  ret i32 %z
+}
+
+; GVN across monotonic store (allowed)
+define i32 @test9() nounwind uwtable ssp {
+; CHECK-LABEL: test9
 ; CHECK: add i32 %x, %x
 entry:
   %x = load i32* @y
@@ -92,8 +118,8 @@
 }
 
 ; GVN of an unordered across monotonic load (not allowed)
-define i32 @test8() nounwind uwtable ssp {
-; CHECK: test8
+define i32 @test10() nounwind uwtable ssp {
+; CHECK-LABEL: test10
 ; CHECK: add i32 %x, %y
 entry:
   %x = load atomic i32* @y unordered, align 4
@@ -103,4 +129,3 @@
   %z = add i32 %x, %y
   ret i32 %z
 }
-
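
For readers who want to see the intuition example from the new comment block as a complete program, here is a minimal C++11 sketch of it. It is illustrative only: the names x, y, reader and writer are invented for this sketch and do not appear in the patch or in LLVM. The program is data-race-free, and the final load of x must observe 42, which is exactly why the analysis has to return getClobber when a release store is followed by an acquire load between the two accesses under consideration:

#include <atomic>
#include <thread>

int x = 0;                // the non-atomic location from the comment
std::atomic<int> y{0};    // only used for synchronization

void reader() {
  x = 0;                                              // store x 0
  y.store(1, std::memory_order_release);              // release action [1]
  while (y.load(std::memory_order_acquire) != 2) {}   // acquire action [4]
  int val = x;   // %val = load x; must read 42 here, so folding it to 0 is wrong
  (void)val;
}

void writer() {
  while (y.load(std::memory_order_acquire) != 1) {}   // acquire action [2]
  x = 42;        // race-free only because [1] synchronizes-with [2]
  y.store(2, std::memory_order_release);              // release action [3]
}

int main() {
  std::thread t1(reader), t2(writer);
  t1.join();
  t2.join();
  return 0;
}

Weakening either the release at [1] or the acquire at [4] to a relaxed (Monotonic) access makes the program racy, and hence undefined, which matches the property the comment cites from the PLDI 2013 paper.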