Index: llvm/trunk/lib/Analysis/MemoryDependenceAnalysis.cpp
===================================================================
--- llvm/trunk/lib/Analysis/MemoryDependenceAnalysis.cpp
+++ llvm/trunk/lib/Analysis/MemoryDependenceAnalysis.cpp
@@ -638,6 +638,15 @@
     if (isInvariantLoad)
       continue;
 
+    // A release fence requires that all stores complete before it, but does
+    // not prevent the reordering of following loads or stores 'before' the
+    // fence. As a result, we look past it when finding a dependency for
+    // loads. DSE uses this to find preceding stores to delete and thus we
+    // can't bypass the fence if the query instruction is a store.
+    if (FenceInst *FI = dyn_cast<FenceInst>(Inst))
+      if (isLoad && FI->getOrdering() == Release)
+        continue;
+
     // See if this instruction (e.g. a call or vaarg) mod/ref's the pointer.
     ModRefInfo MR = AA.getModRefInfo(Inst, MemLoc);
     // If necessary, perform additional analysis.
Index: llvm/trunk/test/Transforms/DeadStoreElimination/fence.ll
===================================================================
--- llvm/trunk/test/Transforms/DeadStoreElimination/fence.ll
+++ llvm/trunk/test/Transforms/DeadStoreElimination/fence.ll
@@ -0,0 +1,48 @@
+; RUN: opt -S -basicaa -dse < %s | FileCheck %s
+
+; We conservatively choose to prevent dead store elimination
+; across release or stronger fences. It's not required
+; (since there must still be a race on %addr.i), but
+; it is conservatively correct. A legal optimization
+; could hoist the second store above the fence, and then
+; DSE one of them.
+define void @test1(i32* %addr.i) {
+; CHECK-LABEL: @test1
+; CHECK: store i32 5
+; CHECK: fence
+; CHECK: store i32 5
+; CHECK: ret
+  store i32 5, i32* %addr.i, align 4
+  fence release
+  store i32 5, i32* %addr.i, align 4
+  ret void
+}
+
+; Same as previous, but with different values. If we ever optimize
+; this more aggressively, this allows us to check that the correct
+; store is retained (the 'i32 1' store in this case).
+define void @test1b(i32* %addr.i) {
+; CHECK-LABEL: @test1b
+; CHECK: store i32 42
+; CHECK: fence release
+; CHECK: store i32 1
+; CHECK: ret
+  store i32 42, i32* %addr.i, align 4
+  fence release
+  store i32 1, i32* %addr.i, align 4
+  ret void
+}
+
+; We *could* DSE across this fence, but don't. No other thread can
+; observe the ordering of the acquire fence and the store.
+define void @test2(i32* %addr.i) {
+; CHECK-LABEL: @test2
+; CHECK: store
+; CHECK: fence
+; CHECK: store
+; CHECK: ret
+  store i32 5, i32* %addr.i, align 4
+  fence acquire
+  store i32 5, i32* %addr.i, align 4
+  ret void
+}
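For illustration only: the more aggressive but still legal outcome described in the comment above @test1 (hoist the second store above the release fence, then DSE the now-redundant one) would leave IR roughly like the sketch below. The function name @test1_hoisted is hypothetical and is not part of this patch.

define void @test1_hoisted(i32* %addr.i) {
  ; the later store of 5 has been hoisted above the fence and the duplicate
  ; removed; distinguishing this from the original requires a racing access
  ; to %addr.i, so the transform is legal even though DSE does not do it
  store i32 5, i32* %addr.i, align 4
  fence release
  ret void
}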
Index: llvm/trunk/test/Transforms/GVN/fence.ll
===================================================================
--- llvm/trunk/test/Transforms/GVN/fence.ll
+++ llvm/trunk/test/Transforms/GVN/fence.ll
@@ -0,0 +1,69 @@
+; RUN: opt -S -basicaa -gvn < %s | FileCheck %s
+
+; We can value forward across the fence since we can (semantically)
+; reorder the following load before the fence.
+define i32 @test(i32* %addr.i) {
+; CHECK-LABEL: @test
+; CHECK: store
+; CHECK: fence
+; CHECK-NOT: load
+; CHECK: ret
+  store i32 5, i32* %addr.i, align 4
+  fence release
+  %a = load i32, i32* %addr.i, align 4
+  ret i32 %a
+}
+
+; Same as above
+define i32 @test2(i32* %addr.i) {
+; CHECK-LABEL: @test2
+; CHECK-NEXT: fence
+; CHECK-NOT: load
+; CHECK: ret
+  %a = load i32, i32* %addr.i, align 4
+  fence release
+  %a2 = load i32, i32* %addr.i, align 4
+  %res = sub i32 %a, %a2
+  ret i32 %res
+}
+
+; We cannot value forward across an acquire barrier since we might
+; be synchronizing with another thread storing to the same variable
+; followed by a release fence. This is not so much enforcing an
+; ordering property (though it is that too) as a liveness
+; property. We expect to eventually see the value stored by
+; another thread when spinning on that location.
define i32 @test3(i32* noalias %addr.i, i32* noalias %otheraddr) {
+; CHECK-LABEL: @test3
+; CHECK: load
+; CHECK: fence
+; CHECK: load
+; CHECK: ret i32 %res
+  ; the following code is intended to model the unrolling of
+  ; two iterations in a spin loop of the form:
+  ;   do { fence acquire; tmp = *%addr.i; } while (!tmp);
+  ; It's hopefully clear that allowing PRE to turn this into:
+  ;   if (!*%addr.i) while(true) {} would be unfortunate
+  fence acquire
+  %a = load i32, i32* %addr.i, align 4
+  fence acquire
+  %a2 = load i32, i32* %addr.i, align 4
+  %res = sub i32 %a, %a2
+  ret i32 %res
+}
+
+; Another example of why forwarding across an acquire fence is problematic
+; can be seen in a normal locking operation. Say we had:
+;   *p = 5; unlock(l); lock(l); use(p);
+; Forwarding the store to p would be invalid. A reasonable implementation
+; of unlock and lock might be:
+;   unlock() { atomicrmw sub %l, 1 unordered; fence release }
+;   lock() {
+;     do {
+;       %res = cmpxchg %l, 0, 1, monotonic monotonic
+;     } while (!%res.success)
+;     fence acquire;
+;   }
+; Given that we choose to forward across the release fence, we clearly can't
+; forward across the acquire fence as well.
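To make the locking scenario above concrete, here is a minimal IR sketch of it, with the cmpxchg spin loop of lock() elided and with hypothetical names (@lock_unlock_sketch, %lock) that are not part of this patch; the atomicrmw uses monotonic ordering since atomicrmw cannot be unordered.

define i32 @lock_unlock_sketch(i32* noalias %p, i32* noalias %lock) {
  ; *p = 5
  store i32 5, i32* %p, align 4
  ; unlock(l): drop the lock word, then publish with a release fence
  %old = atomicrmw sub i32* %lock, i32 1 monotonic
  fence release
  ; lock(l): the cmpxchg spin loop on %lock is elided; once the lock is
  ; reacquired, the acquire fence orders the following load after it
  fence acquire
  ; use(p): forwarding the earlier "store i32 5" to this load would be
  ; invalid, since another thread may have written %p while holding the lock
  %v = load i32, i32* %p, align 4
  ret i32 %v
}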