Index: llvm/trunk/lib/Transforms/Scalar/GVN.cpp
===================================================================
--- llvm/trunk/lib/Transforms/Scalar/GVN.cpp
+++ llvm/trunk/lib/Transforms/Scalar/GVN.cpp
@@ -1219,6 +1219,7 @@
   assert((DepInfo.isDef() || DepInfo.isClobber()) &&
          "expected a local dependence");
+  assert(LI->isUnordered() && "rules below are incorrect for ordered access");
 
   const DataLayout &DL = LI->getModule()->getDataLayout();
 
@@ -1227,7 +1228,8 @@
   // read by the load, we can extract the bits we need for the load from the
   // stored value.
   if (StoreInst *DepSI = dyn_cast<StoreInst>(DepInfo.getInst())) {
-    if (Address) {
+    // Can't forward from non-atomic to atomic without violating memory model.
+    if (Address && LI->isAtomic() <= DepSI->isAtomic()) {
       int Offset =
           AnalyzeLoadFromClobberingStore(LI->getType(), Address, DepSI);
       if (Offset != -1) {
@@ -1244,7 +1246,8 @@
   if (LoadInst *DepLI = dyn_cast<LoadInst>(DepInfo.getInst())) {
     // If this is a clobber and L is the first instruction in its block, then
     // we have the first instruction in the entry block.
-    if (DepLI != LI && Address) {
+    // Can't forward from non-atomic to atomic without violating memory model.
+    if (DepLI != LI && Address && LI->isAtomic() <= DepLI->isAtomic()) {
       int Offset =
           AnalyzeLoadFromClobberingLoad(LI->getType(), Address, DepLI, DL);
 
@@ -1258,7 +1261,7 @@
   // If the clobbering value is a memset/memcpy/memmove, see if we can
   // forward a value on from it.
   if (MemIntrinsic *DepMI = dyn_cast<MemIntrinsic>(DepInfo.getInst())) {
-    if (Address) {
+    if (Address && !LI->isAtomic()) {
       int Offset =
           AnalyzeLoadFromClobberingMemInst(LI->getType(), Address, DepMI, DL);
       if (Offset != -1) {
@@ -1304,6 +1307,10 @@
                                          LI->getType(), DL))
       return false;
 
+    // Can't forward from non-atomic to atomic without violating memory model.
+    if (S->isAtomic() < LI->isAtomic())
+      return false;
+
     Res = AvailableValue::get(S->getValueOperand());
     return true;
   }
@@ -1316,6 +1323,10 @@
         !CanCoerceMustAliasedValueToLoad(LD, LI->getType(), DL))
       return false;
 
+    // Can't forward from non-atomic to atomic without violating memory model.
+    if (LD->isAtomic() < LI->isAtomic())
+      return false;
+
     Res = AvailableValue::getLoad(LD);
     return true;
   }
@@ -1587,6 +1598,11 @@
   if (LI->getParent()->getParent()->hasFnAttribute(Attribute::SanitizeAddress))
     return false;
 
+  // This code hasn't been audited for atomic, ordered, or volatile memory
+  // access.
+  if (!LI->isSimple())
+    return false;
+
   // Step 1: Find the non-local dependencies of the load.
   LoadDepVect Deps;
   MD->getNonLocalPointerDependency(LI, Deps);
 
@@ -1755,7 +1771,8 @@
   if (!MD)
     return false;
 
-  if (!L->isSimple())
+  // This code hasn't been audited for ordered or volatile memory access.
+  if (!L->isUnordered())
     return false;
 
   if (L->use_empty()) {
Index: llvm/trunk/test/Transforms/GVN/atomic.ll
===================================================================
--- llvm/trunk/test/Transforms/GVN/atomic.ll
+++ llvm/trunk/test/Transforms/GVN/atomic.ll
@@ -107,3 +107,233 @@
   ; CHECK: load i32, i32* @y, align 4
   ret i32 %load
 }
+
+; CHECK-LABEL: @test12(
+; Can't remove a load over an ordering barrier
+define i32 @test12(i1 %B, i32* %P1, i32* %P2) {
+  %load0 = load i32, i32* %P1
+  %1 = load atomic i32, i32* %P2 seq_cst, align 4
+  %load1 = load i32, i32* %P1
+  %sel = select i1 %B, i32 %load0, i32 %load1
+  ret i32 %sel
+  ; CHECK: load i32, i32* %P1
+  ; CHECK: load i32, i32* %P1
+}
+
+; CHECK-LABEL: @test13(
+; atomic to non-atomic forwarding is legal
+define i32 @test13(i32* %P1) {
+  %a = load atomic i32, i32* %P1 seq_cst, align 4
+  %b = load i32, i32* %P1
+  %res = sub i32 %a, %b
+  ret i32 %res
+  ; CHECK: load atomic i32, i32* %P1
+  ; CHECK: ret i32 0
+}
+
+; CHECK-LABEL: @test13b(
+define i32 @test13b(i32* %P1) {
+  store atomic i32 0, i32* %P1 unordered, align 4
+  %b = load i32, i32* %P1
+  ret i32 %b
+  ; CHECK: ret i32 0
+}
+
+; CHECK-LABEL: @test14(
+; atomic to unordered atomic forwarding is legal
+define i32 @test14(i32* %P1) {
+  %a = load atomic i32, i32* %P1 seq_cst, align 4
+  %b = load atomic i32, i32* %P1 unordered, align 4
+  %res = sub i32 %a, %b
+  ret i32 %res
+  ; CHECK: load atomic i32, i32* %P1 seq_cst
+  ; CHECK-NEXT: ret i32 0
+}
+
+; CHECK-LABEL: @test15(
+; implementation restriction: can't forward to anything stronger
+; than unordered
+define i32 @test15(i32* %P1, i32* %P2) {
+  %a = load atomic i32, i32* %P1 seq_cst, align 4
+  %b = load atomic i32, i32* %P1 seq_cst, align 4
+  %res = sub i32 %a, %b
+  ret i32 %res
+  ; CHECK: load atomic i32, i32* %P1
+  ; CHECK: load atomic i32, i32* %P1
+}
+
+; CHECK-LABEL: @test16(
+; forwarding non-atomic to atomic is wrong! (However,
+; it would be legal to use the later value in place of the
+; former in this particular example. We just don't
+; do that right now.)
+define i32 @test16(i32* %P1, i32* %P2) {
+  %a = load i32, i32* %P1, align 4
+  %b = load atomic i32, i32* %P1 unordered, align 4
+  %res = sub i32 %a, %b
+  ret i32 %res
+  ; CHECK: load i32, i32* %P1
+  ; CHECK: load atomic i32, i32* %P1
+}
+
+; CHECK-LABEL: @test16b(
+define i32 @test16b(i32* %P1) {
+  store i32 0, i32* %P1
+  %b = load atomic i32, i32* %P1 unordered, align 4
+  ret i32 %b
+  ; CHECK: load atomic i32, i32* %P1
+}
+
+; Can't DSE across a full fence
+define void @fence_seq_cst_store(i32* %P1, i32* %P2) {
+; CHECK-LABEL: @fence_seq_cst_store(
+; CHECK: store
+; CHECK: store atomic
+; CHECK: store
+  store i32 0, i32* %P1, align 4
+  store atomic i32 0, i32* %P2 seq_cst, align 4
+  store i32 0, i32* %P1, align 4
+  ret void
+}
+
+; Can't DSE across a full fence
+define void @fence_seq_cst(i32* %P1, i32* %P2) {
+; CHECK-LABEL: @fence_seq_cst(
+; CHECK: store
+; CHECK: fence seq_cst
+; CHECK: store
+  store i32 0, i32* %P1, align 4
+  fence seq_cst
+  store i32 0, i32* %P1, align 4
+  ret void
+}
+
+; Can't DSE across a full singlethread fence
+define void @fence_seq_cst_st(i32* %P1, i32* %P2) {
+; CHECK-LABEL: @fence_seq_cst_st(
+; CHECK: store
+; CHECK: fence singlethread seq_cst
+; CHECK: store
+  store i32 0, i32* %P1, align 4
+  fence singlethread seq_cst
+  store i32 0, i32* %P1, align 4
+  ret void
+}
+
+; Can't DSE across a full fence
+define void @fence_asm_sideeffect(i32* %P1, i32* %P2) {
+; CHECK-LABEL: @fence_asm_sideeffect(
+; CHECK: store
+; CHECK: call void asm sideeffect
+; CHECK: store
+  store i32 0, i32* %P1, align 4
+  call void asm sideeffect "", ""()
+  store i32 0, i32* %P1, align 4
+  ret void
+}
+
+; Can't DSE across a full fence
+define void @fence_asm_memory(i32* %P1, i32* %P2) {
+; CHECK-LABEL: @fence_asm_memory(
+; CHECK: store
+; CHECK: call void asm
+; CHECK: store
+  store i32 0, i32* %P1, align 4
+  call void asm "", "~{memory}"()
+  store i32 0, i32* %P1, align 4
+  ret void
+}
+
+; Can't remove a volatile load
+define i32 @volatile_load(i32* %P1, i32* %P2) {
+  %a = load i32, i32* %P1, align 4
+  %b = load volatile i32, i32* %P1, align 4
+  %res = sub i32 %a, %b
+  ret i32 %res
+  ; CHECK-LABEL: @volatile_load(
+  ; CHECK: load i32, i32* %P1
+  ; CHECK: load volatile i32, i32* %P1
+}
+
+; Can't remove redundant volatile loads
+define i32 @redundant_volatile_load(i32* %P1, i32* %P2) {
+  %a = load volatile i32, i32* %P1, align 4
+  %b = load volatile i32, i32* %P1, align 4
+  %res = sub i32 %a, %b
+  ret i32 %res
+  ; CHECK-LABEL: @redundant_volatile_load(
+  ; CHECK: load volatile i32, i32* %P1
+  ; CHECK: load volatile i32, i32* %P1
+  ; CHECK: sub
+}
+
+; Can't DSE a volatile store
+define void @volatile_store(i32* %P1, i32* %P2) {
+; CHECK-LABEL: @volatile_store(
+; CHECK: store volatile
+; CHECK: store
+  store volatile i32 0, i32* %P1, align 4
+  store i32 3, i32* %P1, align 4
+  ret void
+}
+
+; Can't DSE a redundant volatile store
+define void @redundant_volatile_store(i32* %P1, i32* %P2) {
+; CHECK-LABEL: @redundant_volatile_store(
+; CHECK: store volatile
+; CHECK: store volatile
+  store volatile i32 0, i32* %P1, align 4
+  store volatile i32 0, i32* %P1, align 4
+  ret void
+}
+
+; Can value forward from volatiles
+define i32 @test20(i32* %P1, i32* %P2) {
+  %a = load volatile i32, i32* %P1, align 4
+  %b = load i32, i32* %P1, align 4
+  %res = sub i32 %a, %b
+  ret i32 %res
+  ; CHECK-LABEL: @test20(
+  ; CHECK: load volatile i32, i32* %P1
+  ; CHECK: ret i32 0
+}
+
+; We're currently conservative about widening
+define i64 @widen1(i32* %P1) {
+  ; CHECK-LABEL: @widen1(
+  ; CHECK: load atomic i32, i32* %P1
+  ; CHECK: load atomic i64, i64* %p2
+  %p2 = bitcast i32* %P1 to i64*
+  %a = load atomic i32, i32* %P1 unordered, align 4
+  %b = load atomic i64, i64* %p2 unordered, align 4
+  %a64 = sext i32 %a to i64
+  %res = sub i64 %a64, %b
+  ret i64 %res
+}
+
+; narrowing does work
+define i64 @narrow(i32* %P1) {
+  ; CHECK-LABEL: @narrow(
+  ; CHECK: load atomic i64, i64* %p2
+  ; CHECK-NOT: load atomic i32, i32* %P1
+  %p2 = bitcast i32* %P1 to i64*
+  %a64 = load atomic i64, i64* %p2 unordered, align 4
+  %b = load atomic i32, i32* %P1 unordered, align 4
+  %b64 = sext i32 %b to i64
+  %res = sub i64 %a64, %b64
+  ret i64 %res
+}
+
+; Missed optimization: we don't yet optimize ordered loads
+define i64 @narrow2(i32* %P1) {
+  ; CHECK-LABEL: @narrow2(
+  ; CHECK: load atomic i64, i64* %p2
+  ; CHECK: load atomic i32, i32* %P1
+  %p2 = bitcast i32* %P1 to i64*
+  %a64 = load atomic i64, i64* %p2 acquire, align 4
+  %b = load atomic i32, i32* %P1 acquire, align 4
+  %b64 = sext i32 %b to i64
+  %res = sub i64 %a64, %b64
+  ret i64 %res
+}
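
As an illustration (not part of the patch): every check added above encodes the same one-directional rule, namely that a value may be forwarded to a load only when the defining access is at least as atomic as the consuming load, so non-atomic-to-atomic forwarding is rejected while the reverse direction stays legal. Below is a minimal standalone C++ sketch of that comparison using plain bools rather than LLVM's LoadInst/StoreInst API; the helper name mayForwardValue is hypothetical and only mirrors the patch's `LI->isAtomic() <= DepSI->isAtomic()` test.

#include <cassert>

// Hypothetical helper (illustration only): returns true when the new GVN
// checks would permit forwarding from the defining access to the load.
// false = non-atomic access, true = (unordered) atomic access. Forwarding
// is allowed only when the definition is at least as atomic as the load,
// which is exactly the ">=" (or mirrored "<=") comparison in the patch.
static bool mayForwardValue(bool DefIsAtomic, bool LoadIsAtomic) {
  return DefIsAtomic >= LoadIsAtomic;
}

int main() {
  assert(mayForwardValue(false, false));  // plain store  -> plain load: ok
  assert(mayForwardValue(true, false));   // atomic store -> plain load: ok
  assert(mayForwardValue(true, true));    // atomic store -> unordered load: ok
  assert(!mayForwardValue(false, true));  // plain store  -> atomic load: rejected
  return 0;
}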