Index: lib/Analysis/MemoryDependenceAnalysis.cpp =================================================================== --- lib/Analysis/MemoryDependenceAnalysis.cpp +++ lib/Analysis/MemoryDependenceAnalysis.cpp @@ -370,6 +370,21 @@ int64_t MemLocOffset = 0; unsigned Limit = BlockScanLimit; bool isInvariantLoad = false; + + // We must be careful with atomic accesses, as they may allow another thread + // to touch this location, clobbering it. Based on the results of + // "Compiler testing via a theory of sound optimisations in the C11/C++11 + // memory model" in PLDI 2013, we know that this is only possible if there + // is a release access, followed by an acquire access. + // The general idea is that for a discriminating context (i.e. that accesses + // MemLoc) to not be racy, it must synchronize with both the access being + // optimized and the previous one, which requires respectively an acquire + // and a release on this thread. + // This paper does not prove this result for RMW accesses/fences, so we are + // conservative with them. They are detected as ModRef by the alias analysis, + // so this function returns Clobber on them automatically. + bool HasSeenAcquire = false; + if (isLoad && QueryInst) { LoadInst *LI = dyn_cast<LoadInst>(QueryInst); if (LI && LI->getMetadata(LLVMContext::MD_invariant_load) != nullptr) @@ -409,9 +424,11 @@ // a load depends on another must aliased load from the same value. if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) { // Atomic loads have complications involved. - // FIXME: This is overly conservative. - if (!LI->isUnordered()) - return MemDepResult::getClobber(LI); + // Unordered/Monotonic accesses are not dangerous for this purpose, + // and a load cannot be Release or Acquire_Release. + if (LI->getOrdering() == Acquire + || LI->getOrdering() == SequentiallyConsistent) + HasSeenAcquire = true; AliasAnalysis::Location LoadLoc = AA->getLocation(LI); @@ -469,8 +486,11 @@ if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { // Atomic stores have complications involved. 
- // FIXME: This is overly conservative. - if (!SI->isUnordered()) + // Unordered/Monotonic accesses are not dangerous for this purpose, + // and a store cannot be Acquire or Acquire_Release. + if (HasSeenAcquire && + (SI->getOrdering() == Release + || SI->getOrdering() == SequentiallyConsistent)) return MemDepResult::getClobber(SI); // If alias analysis can tell that this store is guaranteed to not modify Index: test/Transforms/DeadStoreElimination/atomic.ll =================================================================== --- test/Transforms/DeadStoreElimination/atomic.ll +++ test/Transforms/DeadStoreElimination/atomic.ll @@ -5,7 +5,7 @@ ; Sanity tests for atomic stores. ; Note that it turns out essentially every transformation DSE does is legal on -; atomic ops, just some transformations are not allowed across them. +; atomic ops, just some transformations are not allowed across them. @x = common global i32 0, align 4 @y = common global i32 0, align 4 @@ -14,7 +14,7 @@ ; DSE across unordered store (allowed) define void @test1() nounwind uwtable ssp { -; CHECK: test1 +; CHECK-LABEL: test1 ; CHECK-NOT: store i32 0 ; CHECK: store i32 1 entry: @@ -24,10 +24,10 @@ ret void } -; DSE across seq_cst load (allowed in theory; not implemented ATM) +; DSE across seq_cst load define i32 @test2() nounwind uwtable ssp { -; CHECK: test2 -; CHECK: store i32 0 +; CHECK-LABEL: test2 +; CHECK-NOT: store i32 0 ; CHECK: store i32 1 entry: store i32 0, i32* @x @@ -36,10 +36,10 @@ ret i32 %x } -; DSE across seq_cst store (store before atomic store must not be removed) +; DSE across seq_cst store define void @test3() nounwind uwtable ssp { -; CHECK: test3 -; CHECK: store i32 +; CHECK-LABEL: test3 +; CHECK-NOT: store i32 ; CHECK: store atomic i32 2 entry: store i32 0, i32* @x @@ -50,7 +50,7 @@ ; DSE remove unordered store (allowed) define void @test4() nounwind uwtable ssp { -; CHECK: test4 +; CHECK-LABEL: test4 ; CHECK-NOT: store atomic ; CHECK: store i32 1 entry: @@ -61,7 +61,8 @@ ; 
DSE unordered store overwriting non-atomic store (allowed) define void @test5() nounwind uwtable ssp { -; CHECK: test5 +; CHECK-LABEL: test5 +; CHECK-NOT: store ; CHECK: store atomic i32 1 entry: store i32 0, i32* @x @@ -71,7 +72,7 @@ ; DSE no-op unordered atomic store (allowed) define void @test6() nounwind uwtable ssp { -; CHECK: test6 +; CHECK-LABEL: test6 ; CHECK-NOT: store ; CHECK: ret void entry: @@ -83,8 +84,8 @@ ; DSE seq_cst store (be conservative; DSE doesn't have infrastructure ; to reason about atomic operations). define void @test7() nounwind uwtable ssp { -; CHECK: test7 -; CHECK: store atomic +; CHECK-LABEL: test7 +; CHECK: store atomic entry: %a = alloca i32 store atomic i32 0, i32* %a seq_cst, align 4 @@ -94,9 +95,9 @@ ; DSE and seq_cst load (be conservative; DSE doesn't have infrastructure ; to reason about atomic operations). define i32 @test8() nounwind uwtable ssp { -; CHECK: test8 +; CHECK-LABEL: test8 ; CHECK: store -; CHECK: load atomic +; CHECK: load atomic entry: %a = alloca i32 call void @randomop(i32* %a) @@ -105,3 +106,108 @@ ret i32 %x } +; DSE should never eliminate an atomic write based only on a previous read +; if the write is monotonic or stronger. +define i32 @test9() nounwind uwtable ssp { +; CHECK-LABEL: test9 +; CHECK: store atomic +entry: + %x = load atomic i32* @x monotonic, align 4 + store atomic i32 %x, i32 * @x monotonic, align 4 + ret i32 %x +} + +; DSE is allowed to operate across monotonic accesses. +define i32 @test10() nounwind uwtable ssp { +; CHECK-LABEL: test10 +; CHECK-NOT: store i32 0 +; CHECK: store i32 1 +entry: + store i32 0, i32* @x + %x = load atomic i32* @y monotonic, align 4 + store i32 1, i32* @x + ret i32 %x +} + +; DSE is even allowed across a pair of an atomic read and then write. 
+define i32 @test11() nounwind uwtable ssp { +; CHECK-LABEL: test11 +; CHECK-NOT: store i32 0 +; CHECK: store i32 1 +entry: + store i32 0, i32* @x + %x = load atomic i32* @y seq_cst, align 4 + store atomic i32 %x, i32* @y seq_cst, align 4 + store i32 1, i32* @x + ret i32 %x +} + +; Atomic operations outside of the pair of considered accesses should be ignored. +define i32 @test12() nounwind uwtable ssp { +; CHECK-LABEL: test12 +; CHECK-NOT: store i32 0 +; CHECK: store i32 1 +entry: + store atomic i32 0, i32 * @y seq_cst, align 4 + store i32 0, i32* @x + %x = load atomic i32* @y seq_cst, align 4 + store atomic i32 %x, i32* @y seq_cst, align 4 + store i32 1, i32* @x + %y = load atomic i32* @y seq_cst, align 4 + ret i32 %x +} + +; But DSE is not allowed across a release-acquire pair. +define i32 @test13() nounwind uwtable ssp { +; CHECK-LABEL: test13 +; CHECK: store i32 0 +; CHECK: store i32 1 +entry: + store i32 0, i32* @x + store atomic i32 0, i32* @y release, align 4 + %x = load atomic i32* @y acquire, align 4 + store i32 1, i32* @x + ret i32 %x +} + +; And we are currently treating fences conservatively. +define void @test14() nounwind uwtable ssp { +; CHECK-LABEL: test14 +; CHECK: store i32 0 +; CHECK: store i32 1 +entry: + store i32 0, i32* @x + fence seq_cst + store i32 1, i32* @x + ret void +} + +; We are also treating RMW, and CAS conservatively if they are not monotonic. +define void @test15() nounwind uwtable ssp { +; CHECK-LABEL: test15 +; CHECK: store i32 0 +; CHECK: store i32 1 +; CHECK: store i32 2 +entry: + store i32 0, i32* @x + %b = atomicrmw add i32* @y, i32 1 release + store i32 1, i32* @x + cmpxchg i32* @y, i32 42, i32 1 acquire monotonic + store i32 2, i32* @x + ret void +} + +; But we still allow DSE across them if they are monotonic. 
+define void @test16() nounwind uwtable ssp { +; CHECK-LABEL: test16 +; CHECK-NOT: store i32 0 +; CHECK-NOT: store i32 1 +; CHECK: store i32 2 +entry: + store i32 0, i32* @x + %b = atomicrmw add i32* @y, i32 1 monotonic + store i32 1, i32* @x + cmpxchg i32* @y, i32 42, i32 1 monotonic monotonic + store i32 2, i32* @x + ret void +} Index: test/Transforms/GVN/atomic.ll =================================================================== --- test/Transforms/GVN/atomic.ll +++ test/Transforms/GVN/atomic.ll @@ -18,10 +18,10 @@ ret i32 %z } -; GVN across seq_cst store (allowed in theory; not implemented ATM) +; GVN across seq_cst store define i32 @test2() nounwind uwtable ssp { ; CHECK: test2 -; CHECK: add i32 %x, %y +; CHECK: add i32 %x, %x entry: %x = load i32* @y store atomic i32 %x, i32* @x seq_cst, align 4 @@ -43,11 +43,11 @@ ret i32 %b } -; GVN across acquire load (load after atomic load must not be removed) +; GVN across acquire load define i32 @test4() nounwind uwtable ssp { ; CHECK: test4 ; CHECK: load atomic i32* @x -; CHECK: load i32* @y +; CHECK-NOT: load i32* @y entry: %x = load i32* @y %y = load atomic i32* @x seq_cst, align 4