Index: lib/Analysis/MemoryDependenceAnalysis.cpp
===================================================================
--- lib/Analysis/MemoryDependenceAnalysis.cpp
+++ lib/Analysis/MemoryDependenceAnalysis.cpp
@@ -370,6 +370,14 @@
   int64_t MemLocOffset = 0;
   unsigned Limit = BlockScanLimit;
   bool isInvariantLoad = false;
+
+  // Atomic accesses are only a problem if there is an acquire after a release.
+  // See "Compiler testing via a theory of sound optimisations in the C11/C++11
+  // memory model" in PLDI 2013 for details.
+  // This paper does not prove this result for RMW accesses/fences, so we are
+  // conservative with them.
+  bool HasSeenAcquire = false;
+
   if (isLoad && QueryInst) {
     LoadInst *LI = dyn_cast<LoadInst>(QueryInst);
     if (LI && LI->getMetadata(LLVMContext::MD_invariant_load) != nullptr)
@@ -409,9 +417,9 @@
     // a load depends on another must aliased load from the same value.
     if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
       // Atomic loads have complications involved.
-      // FIXME: This is overly conservative.
-      if (!LI->isUnordered())
-        return MemDepResult::getClobber(LI);
+      // Monotonic accesses are not dangerous for this purpose.
+      if (!LI->isUnordered() && LI->getOrdering() != Monotonic)
+        HasSeenAcquire = true;
 
       AliasAnalysis::Location LoadLoc = AA->getLocation(LI);
 
@@ -469,8 +477,8 @@
 
     if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
       // Atomic stores have complications involved.
-      // FIXME: This is overly conservative.
-      if (!SI->isUnordered())
+      // Monotonic accesses are safe for this purpose.
+      if (HasSeenAcquire && !SI->isUnordered() && SI->getOrdering() != Monotonic)
         return MemDepResult::getClobber(SI);
 
       // If alias analysis can tell that this store is guaranteed to not modify
@@ -495,6 +503,25 @@
       return MemDepResult::getClobber(Inst);
     }
 
+    // FIXME: We are probably too conservative here.
+    if (isa<FenceInst>(Inst)) {
+      return MemDepResult::getClobber(Inst);
+    }
+
+    // FIXME: We are probably too conservative here.
+    if (auto RMWI = dyn_cast<AtomicRMWInst>(Inst)) {
+      if (RMWI->getOrdering() != Monotonic) {
+        return MemDepResult::getClobber(Inst);
+      }
+    }
+
+    // FIXME: We are probably too conservative here.
+    if (auto CASI = dyn_cast<AtomicCmpXchgInst>(Inst)) {
+      if (CASI->getSuccessOrdering() != Monotonic) {
+        return MemDepResult::getClobber(Inst);
+      }
+    }
+
     // If this is an allocation, and if we know that the accessed pointer is to
     // the allocation, return Def. This means that there is no dependence and
     // the access can be optimized based on that. For example, a load could
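
(Reviewer aside, not part of the patch: the HasSeenAcquire rule can be sanity-checked against the test13 pattern added below. The following C++11 sketch is illustrative only; the thread structure and names are assumptions, with x and y mirroring the globals in the tests.)

// Why the first store to x in test13 must survive DSE: between the release
// store and the later acquire load, another thread can observe x == 0 without
// any data race. If "x = 0" were eliminated, the assert below could observe
// the initial value -1 instead.
#include <atomic>
#include <cassert>
#include <thread>

std::atomic<int> y(0);
int x = -1;

void writer() {
  x = 0;                                  // test13's "store i32 0, i32* @x"
  y.store(1, std::memory_order_release);  // release store to y
  while (y.load(std::memory_order_acquire) != 2) {
  }                                       // acquire load after the release
  x = 1;                                  // test13's "store i32 1, i32* @x"
}

void reader() {
  while (y.load(std::memory_order_acquire) != 1) {
  }                                       // synchronizes-with writer's release
  assert(x == 0);                         // race-free; must see the first store
  y.store(2, std::memory_order_release);  // unblock the writer
}

int main() {
  std::thread t1(writer);
  std::thread t2(reader);
  t1.join();
  t2.join();
  return 0;
}

With no acquire between the two stores, any other thread's read of x in that window would be a data race, so eliminating the first store is unobservable in race-free programs; that is the PLDI 2013 soundness argument cited in the code comment. Monotonic accesses never create synchronizes-with edges, which is why the check ignores them.
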
Index: test/Transforms/DeadStoreElimination/atomic.ll
===================================================================
--- test/Transforms/DeadStoreElimination/atomic.ll
+++ test/Transforms/DeadStoreElimination/atomic.ll
@@ -14,7 +14,7 @@
 
 ; DSE across unordered store (allowed)
 define void @test1() nounwind uwtable ssp {
-; CHECK: test1
+; CHECK-LABEL: test1
 ; CHECK-NOT: store i32 0
 ; CHECK: store i32 1
 entry:
@@ -24,10 +24,10 @@
   ret void
 }
 
-; DSE across seq_cst load (allowed in theory; not implemented ATM)
+; DSE across seq_cst load
 define i32 @test2() nounwind uwtable ssp {
-; CHECK: test2
-; CHECK: store i32 0
+; CHECK-LABEL: test2
+; CHECK-NOT: store i32 0
 ; CHECK: store i32 1
 entry:
   store i32 0, i32* @x
   %x = load atomic i32* @x seq_cst, align 4
   ret i32 %x
 }
 
-; DSE across seq_cst store (store before atomic store must not be removed)
+; DSE across seq_cst store
 define void @test3() nounwind uwtable ssp {
-; CHECK: test3
-; CHECK: store i32
+; CHECK-LABEL: test3
+; CHECK-NOT: store i32
 ; CHECK: store atomic i32 2
 entry:
   store i32 0, i32* @x
   store atomic i32 2, i32* @x seq_cst, align 4
   ret void
 }
@@ -50,7 +50,7 @@
 
 ; DSE remove unordered store (allowed)
 define void @test4() nounwind uwtable ssp {
-; CHECK: test4
+; CHECK-LABEL: test4
 ; CHECK-NOT: store atomic
 ; CHECK: store i32 1
 entry:
@@ -61,7 +61,8 @@
 
 ; DSE unordered store overwriting non-atomic store (allowed)
 define void @test5() nounwind uwtable ssp {
-; CHECK: test5
+; CHECK-LABEL: test5
+; CHECK-NOT: store
 ; CHECK: store atomic i32 1
 entry:
   store i32 0, i32* @x
@@ -71,7 +72,7 @@
 
 ; DSE no-op unordered atomic store (allowed)
 define void @test6() nounwind uwtable ssp {
-; CHECK: test6
+; CHECK-LABEL: test6
 ; CHECK-NOT: store
 ; CHECK: ret void
 entry:
@@ -83,8 +84,8 @@
 ; DSE seq_cst store (be conservative; DSE doesn't have infrastructure
 ; to reason about atomic operations).
 define void @test7() nounwind uwtable ssp {
-; CHECK: test7
+; CHECK-LABEL: test7
 ; CHECK: store atomic
 entry:
   %a = alloca i32
   store atomic i32 0, i32* %a seq_cst, align 4
@@ -94,9 +95,9 @@
 ; DSE and seq_cst load (be conservative; DSE doesn't have infrastructure
 ; to reason about atomic operations).
 define i32 @test8() nounwind uwtable ssp {
-; CHECK: test8
+; CHECK-LABEL: test8
 ; CHECK: store
 ; CHECK: load atomic
 entry:
   %a = alloca i32
   call void @randomop(i32* %a)
@@ -105,3 +106,108 @@
   ret i32 %x
 }
 
+; DSE should never eliminate an atomic write based only on a previous read
+; if the write is monotonic or stronger.
+define i32 @test9() nounwind uwtable ssp {
+; CHECK-LABEL: test9
+; CHECK: store atomic
+entry:
+  %x = load atomic i32* @x monotonic, align 4
+  store atomic i32 %x, i32* @x monotonic, align 4
+  ret i32 %x
+}
+
+; DSE is allowed to operate across monotonic accesses.
+define i32 @test10() nounwind uwtable ssp {
+; CHECK-LABEL: test10
+; CHECK-NOT: store i32 0
+; CHECK: store i32 1
+entry:
+  store i32 0, i32* @x
+  %x = load atomic i32* @y monotonic, align 4
+  store i32 1, i32* @x
+  ret i32 %x
+}
+
+; DSE is even allowed across an atomic read followed by an atomic write.
+define i32 @test11() nounwind uwtable ssp {
+; CHECK-LABEL: test11
+; CHECK-NOT: store i32 0
+; CHECK: store i32 1
+entry:
+  store i32 0, i32* @x
+  %x = load atomic i32* @y seq_cst, align 4
+  store atomic i32 %x, i32* @y seq_cst, align 4
+  store i32 1, i32* @x
+  ret i32 %x
+}
+
+; Atomic operations outside of the pair of considered accesses should be
+; ignored.
+define i32 @test12() nounwind uwtable ssp {
+; CHECK-LABEL: test12
+; CHECK-NOT: store i32 0
+; CHECK: store i32 1
+entry:
+  store atomic i32 0, i32* @y seq_cst, align 4
+  store i32 0, i32* @x
+  %x = load atomic i32* @y seq_cst, align 4
+  store atomic i32 %x, i32* @y seq_cst, align 4
+  store i32 1, i32* @x
+  %y = load atomic i32* @y seq_cst, align 4
+  ret i32 %x
+}
+
+; But DSE is not allowed across a release-acquire pair.
+define i32 @test13() nounwind uwtable ssp {
+; CHECK-LABEL: test13
+; CHECK: store i32 0
+; CHECK: store i32 1
+entry:
+  store i32 0, i32* @x
+  store atomic i32 0, i32* @y release, align 4
+  %x = load atomic i32* @y acquire, align 4
+  store i32 1, i32* @x
+  ret i32 %x
+}
+
+; And we currently treat fences conservatively.
+define void @test14() nounwind uwtable ssp {
+; CHECK-LABEL: test14
+; CHECK: store i32 0
+; CHECK: store i32 1
+entry:
+  store i32 0, i32* @x
+  fence seq_cst
+  store i32 1, i32* @x
+  ret void
+}
+
+; We also treat RMW and CAS conservatively if they are not monotonic.
+define void @test15() nounwind uwtable ssp {
+; CHECK-LABEL: test15
+; CHECK: store i32 0
+; CHECK: store i32 1
+; CHECK: store i32 2
+entry:
+  store i32 0, i32* @x
+  %b = atomicrmw add i32* @y, i32 1 release
+  store i32 1, i32* @x
+  cmpxchg i32* @y, i32 42, i32 1 acquire monotonic
+  store i32 2, i32* @x
+  ret void
+}
+
+; But we still allow DSE across them if they are monotonic.
+define void @test16() nounwind uwtable ssp {
+; CHECK-LABEL: test16
+; CHECK-NOT: store i32 0
+; CHECK-NOT: store i32 1
+; CHECK: store i32 2
+entry:
+  store i32 0, i32* @x
+  %b = atomicrmw add i32* @y, i32 1 monotonic
+  store i32 1, i32* @x
+  cmpxchg i32* @y, i32 42, i32 1 monotonic monotonic
+  store i32 2, i32* @x
+  ret void
+}
Index: test/Transforms/GVN/atomic.ll
===================================================================
--- test/Transforms/GVN/atomic.ll
+++ test/Transforms/GVN/atomic.ll
@@ -18,10 +18,10 @@
   ret i32 %z
 }
 
-; GVN across seq_cst store (allowed in theory; not implemented ATM)
+; GVN across seq_cst store
 define i32 @test2() nounwind uwtable ssp {
 ; CHECK: test2
-; CHECK: add i32 %x, %y
+; CHECK: add i32 %x, %x
 entry:
   %x = load i32* @y
   store atomic i32 %x, i32* @x seq_cst, align 4
@@ -43,11 +43,11 @@
   ret i32 %b
 }
 
-; GVN across acquire load (load after atomic load must not be removed)
+; GVN across acquire load
 define i32 @test4() nounwind uwtable ssp {
 ; CHECK: test4
 ; CHECK: load atomic i32* @x
-; CHECK: load i32* @y
+; CHECK-NOT: load i32* @y
 entry:
   %x = load i32* @y
   %y = load atomic i32* @x seq_cst, align 4
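
(Also an aside, not part of the patch: assuming a configured build tree, the updated tests can be driven through their own RUN lines with lit, e.g. bin/llvm-lit test/Transforms/DeadStoreElimination/atomic.ll test/Transforms/GVN/atomic.ll.)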