Index: llvm/trunk/lib/Analysis/MemoryDependenceAnalysis.cpp
===================================================================
--- llvm/trunk/lib/Analysis/MemoryDependenceAnalysis.cpp
+++ llvm/trunk/lib/Analysis/MemoryDependenceAnalysis.cpp
@@ -638,6 +638,15 @@
     if (isInvariantLoad)
       continue;
 
+    // A release fence requires that all stores complete before it, but does
+    // not prevent the reordering of following loads or stores 'before' the
+    // fence. As a result, we look past it when finding a dependency for
+    // loads. DSE uses this to find preceding stores to delete and thus we
+    // can't bypass the fence if the query instruction is a store.
+    if (FenceInst *FI = dyn_cast<FenceInst>(Inst))
+      if (isLoad && FI->getOrdering() == Release)
+        continue;
+
     // See if this instruction (e.g. a call or vaarg) mod/ref's the pointer.
     ModRefInfo MR = AA.getModRefInfo(Inst, MemLoc);
     // If necessary, perform additional analysis.
Index: llvm/trunk/test/Transforms/DeadStoreElimination/fence.ll
===================================================================
--- llvm/trunk/test/Transforms/DeadStoreElimination/fence.ll
+++ llvm/trunk/test/Transforms/DeadStoreElimination/fence.ll
@@ -0,0 +1,48 @@
+; RUN: opt -S -basicaa -dse < %s | FileCheck %s
+
+; We conservatively choose to prevent dead store elimination
+; across release or stronger fences. It's not required
+; (since there must still be a race on %addr.i), but
+; it is conservatively correct. A legal optimization
+; could hoist the second store above the fence, and then
+; DSE one of them.
+define void @test1(i32* %addr.i) {
+; CHECK-LABEL: @test1
+; CHECK: store i32 5
+; CHECK: fence
+; CHECK: store i32 5
+; CHECK: ret
+  store i32 5, i32* %addr.i, align 4
+  fence release
+  store i32 5, i32* %addr.i, align 4
+  ret void
+}
+
+; Same as previous, but with different values. If we ever optimize
+; this more aggressively, this allows us to check that the correct
+; store is retained (the 'i32 1' store in this case).
+define void @test1b(i32* %addr.i) {
+; CHECK-LABEL: @test1b
+; CHECK: store i32 42
+; CHECK: fence release
+; CHECK: store i32 1
+; CHECK: ret
+  store i32 42, i32* %addr.i, align 4
+  fence release
+  store i32 1, i32* %addr.i, align 4
+  ret void
+}
+
+; We *could* DSE across this fence, but don't. No other thread can
+; observe the ordering of the acquire fence and the store.
+define void @test2(i32* %addr.i) {
+; CHECK-LABEL: @test2
+; CHECK: store
+; CHECK: fence
+; CHECK: store
+; CHECK: ret
+  store i32 5, i32* %addr.i, align 4
+  fence acquire
+  store i32 5, i32* %addr.i, align 4
+  ret void
+}
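For illustration only: the more aggressive but still legal outcome described in the comment above @test1 (hoist the second store above the release fence, then DSE the now-redundant one) would leave IR roughly like the sketch below. The function name @test1_hoisted is hypothetical and is not part of this patch.

define void @test1_hoisted(i32* %addr.i) {
  ; the later store of 5 has been hoisted above the fence and the duplicate
  ; removed; distinguishing this from the original requires a racing access
  ; to %addr.i, so the transform is legal even though DSE does not do it
  store i32 5, i32* %addr.i, align 4
  fence release
  ret void
}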
Index: llvm/trunk/test/Transforms/GVN/fence.ll
===================================================================
--- llvm/trunk/test/Transforms/GVN/fence.ll
+++ llvm/trunk/test/Transforms/GVN/fence.ll
@@ -0,0 +1,69 @@
+; RUN: opt -S -basicaa -gvn < %s | FileCheck %s
+
+; We can value forward across the fence since we can (semantically)
+; reorder the following load before the fence.
+define i32 @test(i32* %addr.i) {
+; CHECK-LABEL: @test
+; CHECK: store
+; CHECK: fence
+; CHECK-NOT: load
+; CHECK: ret
+  store i32 5, i32* %addr.i, align 4
+  fence release
+  %a = load i32, i32* %addr.i, align 4
+  ret i32 %a
+}
+
+; Same as above
+define i32 @test2(i32* %addr.i) {
+; CHECK-LABEL: @test2
+; CHECK-NEXT: fence
+; CHECK-NOT: load
+; CHECK: ret
+  %a = load i32, i32* %addr.i, align 4
+  fence release
+  %a2 = load i32, i32* %addr.i, align 4
+  %res = sub i32 %a, %a2
+  ret i32 %res
+}
+
+; We cannot value forward across an acquire barrier since we might
+; be synchronizing with another thread storing to the same variable
+; followed by a release fence. This is not so much enforcing an
+; ordering property (though it is that too) as a liveness
+; property. We expect to eventually see the value stored by
+; another thread when spinning on that location.
define i32 @test3(i32* noalias %addr.i, i32* noalias %otheraddr) {
+; CHECK-LABEL: @test3
+; CHECK: load
+; CHECK: fence
+; CHECK: load
+; CHECK: ret i32 %res
+  ; the following code is intended to model the unrolling of
+  ; two iterations in a spin loop of the form:
+  ;   do { fence acquire; tmp = *%addr.i; } while (!tmp);
+  ; It's hopefully clear that allowing PRE to turn this into:
+  ;   if (!*%addr.i) while(true) {} would be unfortunate
+  fence acquire
+  %a = load i32, i32* %addr.i, align 4
+  fence acquire
+  %a2 = load i32, i32* %addr.i, align 4
+  %res = sub i32 %a, %a2
+  ret i32 %res
+}
+
+; Another example of why forwarding across an acquire fence is problematic
+; can be seen in a normal locking operation. Say we had:
+;   *p = 5; unlock(l); lock(l); use(p);
+; Forwarding the store to p would be invalid. A reasonable implementation
+; of unlock and lock might be:
+;   unlock() { atomicrmw sub %l, 1 unordered; fence release }
+;   lock() {
+;     do {
+;       %res = cmpxchg %l, 0, 1, monotonic monotonic
+;     } while (!%res.success)
+;     fence acquire;
+;   }
+; Given that we choose to forward across the release fence, we clearly can't
+; forward across the acquire fence as well.
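To make the locking scenario above concrete, here is a minimal IR sketch of it, with the cmpxchg spin loop of lock() elided and with hypothetical names (@lock_unlock_sketch, %lock) that are not part of this patch; the atomicrmw uses monotonic ordering since atomicrmw cannot be unordered.

define i32 @lock_unlock_sketch(i32* noalias %p, i32* noalias %lock) {
  ; *p = 5
  store i32 5, i32* %p, align 4
  ; unlock(l): drop the lock word, then publish with a release fence
  %old = atomicrmw sub i32* %lock, i32 1 monotonic
  fence release
  ; lock(l): the cmpxchg spin loop on %lock is elided; once the lock is
  ; reacquired, the acquire fence orders the following load after it
  fence acquire
  ; use(p): forwarding the earlier "store i32 5" to this load would be
  ; invalid, since another thread may have written %p while holding the lock
  %v = load i32, i32* %p, align 4
  ret i32 %v
}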