diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
--- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -1821,6 +1821,58 @@
   }
 };
 
+/// \returns true if \p KillingDef stores the result of \p Load to the source of
+/// \p Load, making the store a no-op that can be removed.
+///
+/// \param KillingDef the MemoryDef of the candidate no-op store.
+static bool storeIsNoop(MemorySSA &MSSA, LoadInst *Load,
+                        MemoryDef *KillingDef) {
+  Instruction *Store = KillingDef->getMemoryInst();
+  // If the load's operand isn't the destination of the store, bail.
+  if (Load->getPointerOperand() != Store->getOperand(1))
+    return false;
+
+  // Get the defining access for the load.
+  auto *LoadAccess = MSSA.getMemoryAccess(Load)->getDefiningAccess();
+  // Fast path: no access clobbers the location between the load and store.
+  if (LoadAccess == KillingDef->getDefiningAccess())
+    return true;
+
+  // Look through phi accesses. Recursively scan all phi accesses by adding them
+  // to a worklist. Bail when we run into a memory def that is not the load's
+  // defining access: that is an unrelated write in between.
+  SetVector<MemoryAccess *> ToCheck;
+  MemoryAccess *Current = KillingDef->getDefiningAccess();
+  // We don't want to bail when we run into the store memory def. But, the phi
+  // access may point to it. So, pretend like we've already checked it.
+  ToCheck.insert(KillingDef);
+  ToCheck.insert(Current);
+  // Start at current (1) to simulate already having checked KillingDef.
+  for (unsigned I = 1; I < ToCheck.size(); ++I) {
+    Current = ToCheck[I];
+    if (auto PhiAccess = dyn_cast<MemoryPhi>(Current)) {
+      // Check all the operands.
+      for (unsigned PhiIndex = 0; PhiIndex < PhiAccess->getNumOperands();
+           ++PhiIndex)
+        ToCheck.insert(PhiAccess->getOperand(PhiIndex));
+      continue;
+    }
+
+    // Defining accesses and phi operands are always MemoryDefs or MemoryPhis,
+    // so a MemoryUse can never reach this point.
+    assert(isa<MemoryDef>(Current) && "Only MemoryDefs should reach here.");
+
+    // We are searching for the definition of the store's destination. So, if
+    // that is the same definition as the load, then this is a noop. Every def
+    // reachable through the phis must be the load's defining access; any other
+    // def is an unrelated write in between an otherwise noop store, so fail.
+    if (LoadAccess != Current)
+      return false;
+  }
+  // All paths from the store lead back to the load's defining access.
+  return true;
+}
+
 bool eliminateDeadStoresMemorySSA(Function &F, AliasAnalysis &AA,
                                   MemorySSA &MSSA, DominatorTree &DT,
                                   PostDominatorTree &PDT,
@@ -1835,6 +1887,28 @@
     if (State.SkipStores.count(KillingDef))
       continue;
     Instruction *SI = KillingDef->getMemoryInst();
+
+    // Check if we're storing a value that we just loaded back to the same
+    // location, making this store a no-op. Only plain stores qualify; the
+    // MemoryDef may be a call or intrinsic, so check for a StoreInst first.
+    // Volatile and ordered atomic stores must be preserved.
+    if (auto *Store = dyn_cast<StoreInst>(SI)) {
+      if (Store->isUnordered()) {
+        if (auto *Load = dyn_cast<LoadInst>(Store->getValueOperand())) {
+          if (storeIsNoop(MSSA, Load, KillingDef)) {
+            // Dump the instruction before deleting it; deleteDeadInstruction
+            // frees SI, so the debug print must come first.
+            LLVM_DEBUG(dbgs() << "DSE: Remove No-Op Store:\n  DEAD: " << *SI
+                              << '\n');
+            State.deleteDeadInstruction(SI);
+            // Keep scanning the remaining MemoryDefs; one removed no-op
+            // store must not end the whole elimination loop.
+            continue;
+          }
+        }
+      }
+    }
+
     auto MaybeSILoc = State.getLocForWriteEx(SI);
     if (!MaybeSILoc) {
       LLVM_DEBUG(dbgs() << "Failed to find analyzable write location for "
diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/atomic.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/atomic.ll
--- a/llvm/test/Transforms/DeadStoreElimination/MSSA/atomic.ll
+++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/atomic.ll
@@ -98,3 +98,37 @@
   store i32 1, i32* @x
   ret i32 %x
 }
+
+; **** Noop load->store tests **************************************************
+
+; We can optimize unordered atomic loads or stores.
+define void @test_load_atomic(i32* %Q) {
+; CHECK-LABEL: @test_load_atomic(
+; CHECK-NEXT:    ret void
+;
+  %a = load atomic i32, i32* %Q unordered, align 4
+  store atomic i32 %a, i32* %Q unordered, align 4
+  ret void
+}
+
+; We can optimize unordered atomic loads or stores.
+define void @test_store_atomic(i32* %Q) {
+; CHECK-LABEL: @test_store_atomic(
+; CHECK-NEXT:    ret void
+;
+  %a = load i32, i32* %Q
+  store atomic i32 %a, i32* %Q unordered, align 4
+  ret void
+}
+
+; We can NOT optimize release atomic loads or stores.
+define void @test_store_atomic_release(i32* %Q) {
+; CHECK-LABEL: @test_store_atomic_release(
+; CHECK-NEXT:    load
+; CHECK-NEXT:    store atomic
+; CHECK-NEXT:    ret void
+;
+  %a = load i32, i32* %Q
+  store atomic i32 %a, i32* %Q release, align 4
+  ret void
+}
diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/simple.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/simple.ll
--- a/llvm/test/Transforms/DeadStoreElimination/MSSA/simple.ll
+++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/simple.ll
@@ -25,15 +25,13 @@
 
 define i32 @test3(i32* %g_addr) nounwind {
 ; CHECK-LABEL: @test3(
-; CHECK-NEXT:    [[G_VALUE:%.*]] = load i32, i32* [[G_ADDR:%.*]], align 4
 ; CHECK-NEXT:    store i32 -1, i32* @g, align 4
-; CHECK-NEXT:    store i32 [[G_VALUE]], i32* [[G_ADDR]], align 4
+; CHECK-NEXT:    store i32 1, i32* %g_addr, align 4
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* @g, align 4
 ; CHECK-NEXT:    ret i32 [[TMP3]]
 ;
-  %g_value = load i32, i32* %g_addr, align 4
   store i32 -1, i32* @g, align 4
-  store i32 %g_value, i32* %g_addr, align 4
+  store i32 1, i32* %g_addr, align 4
   %tmp3 = load i32, i32* @g, align 4
   ret i32 %tmp3
 }
@@ -268,8 +266,8 @@
 ; CHECK:       bb2:
 ; CHECK-NEXT:    br label [[BB3]]
 ; CHECK:       bb3:
-; CHECK-NEXT:    store i32 [[V]], i32* [[P]], align 4
-; CHECK-NEXT:    ret i32 0
+; CHECK-NEXT:    store i32 [[I]], i32* [[P]], align 4
+; CHECK-NEXT:    ret i32 [[V]]
 ;
 entry:
   %v = load i32, i32* %p, align 4
@@ -282,8 +280,8 @@
 bb2:
   br label %bb3
 bb3:
-  store i32 %v, i32* %p, align 4
-  ret i32 0
+  store i32 %i, i32* %p, align 4
+  ret i32 %v
 }
 
 ; Don't remove redundant store because of may-aliased store.
@@ -298,8 +296,8 @@
 ; CHECK-NEXT:    store i32 [[I:%.*]], i32* [[P2:%.*]], align 4
 ; CHECK-NEXT:    br label [[BB3]]
 ; CHECK:       bb3:
-; CHECK-NEXT:    store i32 [[V]], i32* [[P]], align 4
-; CHECK-NEXT:    ret i32 0
+; CHECK-NEXT:    store i32 [[I]], i32* [[P]], align 4
+; CHECK-NEXT:    ret i32 [[V]]
 ;
 entry:
   %v = load i32, i32* %p, align 4
@@ -311,8 +309,8 @@
   store i32 %i, i32* %p2, align 4
   br label %bb3
 bb3:
-  store i32 %v, i32* %p, align 4
-  ret i32 0
+  store i32 %i, i32* %p, align 4
+  ret i32 %v
 }
 
 declare void @unknown_func()
@@ -329,8 +327,8 @@
 ; CHECK-NEXT:    call void @unknown_func()
 ; CHECK-NEXT:    br label [[BB3]]
 ; CHECK:       bb3:
-; CHECK-NEXT:    store i32 [[V]], i32* [[P]], align 4
-; CHECK-NEXT:    ret i32 0
+; CHECK-NEXT:    store i32 [[I]], i32* [[P]], align 4
+; CHECK-NEXT:    ret i32 [[V]]
 ;
 entry:
   %v = load i32, i32* %p, align 4
@@ -342,8 +340,8 @@
   call void @unknown_func()
   br label %bb3
 bb3:
-  store i32 %v, i32* %p, align 4
-  ret i32 0
+  store i32 %i, i32* %p, align 4
+  ret i32 %v
 }
 
 ; Don't remove redundant store in a loop with a may-alias store.
@@ -353,22 +351,22 @@
 ; CHECK-NEXT:    [[V:%.*]] = load i32, i32* [[P:%.*]], align 4
 ; CHECK-NEXT:    br label [[BB1:%.*]]
 ; CHECK:       bb1:
-; CHECK-NEXT:    store i32 [[V]], i32* [[P]], align 4
+; CHECK-NEXT:    store i32 [[I]], i32* [[P]], align 4
 ; CHECK-NEXT:    call void @unknown_func()
 ; CHECK-NEXT:    br i1 undef, label [[BB1]], label [[BB2:%.*]]
 ; CHECK:       bb2:
-; CHECK-NEXT:    ret i32 0
+; CHECK-NEXT:    ret i32 [[V]]
 ;
 entry:
   %v = load i32, i32* %p, align 4
   br label %bb1
 bb1:
-  store i32 %v, i32* %p, align 4
+  store i32 %i, i32* %p, align 4
   ; Might read and overwrite value at %p
   call void @unknown_func()
   br i1 undef, label %bb1, label %bb2
 bb2:
-  ret i32 0
+  ret i32 %v
 }
 
 ; TODO
@@ -579,3 +577,181 @@
   store atomic i8 3, i8* %P2 unordered, align 4
   ret void
 }
+
+; **** Noop load->store tests **************************************************
+
+; We can NOT remove the store when the load is volatile.
+define void @test_load_volatile(i32* %Q) {
+; CHECK-LABEL: @test_load_volatile(
+; CHECK-NEXT:    [[A:%.*]] = load volatile i32, i32* [[Q:%.*]]
+; CHECK-NEXT:    store i32 [[A]], i32* [[Q]]
+; CHECK-NEXT:    ret void
+;
+  %a = load volatile i32, i32* %Q
+  store i32 %a, i32* %Q
+  ret void
+}
+
+; We can NOT optimize volatile stores.
+define void @test_store_volatile(i32* %Q) {
+; CHECK-LABEL: @test_store_volatile(
+; CHECK-NEXT:    [[A:%.*]] = load i32, i32* [[Q:%.*]]
+; CHECK-NEXT:    store volatile i32 [[A]]
+; CHECK-NEXT:    ret void
+;
+  %a = load i32, i32* %Q
+  store volatile i32 %a, i32* %Q
+  ret void
+}
+
+; PR2599 - load -> store to same address.
+define void @test12({ i32, i32 }* %x) nounwind  {
+; CHECK-LABEL: @test12(
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr { i32, i32 }, { i32, i32 }* [[X:%.*]], i32 0, i32 1
+; CHECK-NEXT:    [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4
+; CHECK-NEXT:    [[TMP17:%.*]] = sub i32 0, [[TMP8]]
+; CHECK-NEXT:    store i32 [[TMP17]], i32* [[TMP7]], align 4
+; CHECK-NEXT:    ret void
+;
+  %tmp4 = getelementptr { i32, i32 }, { i32, i32 }* %x, i32 0, i32 0
+  %tmp5 = load i32, i32* %tmp4, align 4
+  %tmp7 = getelementptr { i32, i32 }, { i32, i32 }* %x, i32 0, i32 1
+  %tmp8 = load i32, i32* %tmp7, align 4
+  %tmp17 = sub i32 0, %tmp8
+  store i32 %tmp5, i32* %tmp4, align 4
+  store i32 %tmp17, i32* %tmp7, align 4
+  ret void
+}
+
+; Remove redundant store if loaded value is in another block.
+define i32 @test26(i1 %c, i32* %p) {
+; CHECK-LABEL: @test26(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[C:%.*]], label [[BB1:%.*]], label [[BB2:%.*]]
+; CHECK:       bb1:
+; CHECK-NEXT:    br label [[BB3:%.*]]
+; CHECK:       bb2:
+; CHECK-NEXT:    br label [[BB3]]
+; CHECK:       bb3:
+; CHECK-NEXT:    ret i32 0
+;
+entry:
+  %v = load i32, i32* %p, align 4
+  br i1 %c, label %bb1, label %bb2
+bb1:
+  br label %bb3
+bb2:
+  store i32 %v, i32* %p, align 4
+  br label %bb3
+bb3:
+  ret i32 0
+}
+
+; Remove redundant store if loaded value is in another block.
+define i32 @test27(i1 %c, i32* %p) {
+; CHECK-LABEL: @test27(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[C:%.*]], label [[BB1:%.*]], label [[BB2:%.*]]
+; CHECK:       bb1:
+; CHECK-NEXT:    br label [[BB3:%.*]]
+; CHECK:       bb2:
+; CHECK-NEXT:    br label [[BB3]]
+; CHECK:       bb3:
+; CHECK-NEXT:    ret i32 0
+;
+entry:
+  %v = load i32, i32* %p, align 4
+  br i1 %c, label %bb1, label %bb2
+bb1:
+  br label %bb3
+bb2:
+  br label %bb3
+bb3:
+  store i32 %v, i32* %p, align 4
+  ret i32 0
+}
+
+; Remove redundant store if loaded value is in another block inside a loop.
+define i32 @test31(i1 %c, i32* %p, i32 %i) {
+; CHECK-LABEL: @test31(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[BB1:%.*]]
+; CHECK:       bb1:
+; CHECK-NEXT:    br i1 undef, label [[BB1]], label [[BB2:%.*]]
+; CHECK:       bb2:
+; CHECK-NEXT:    ret i32 0
+;
+entry:
+  %v = load i32, i32* %p, align 4
+  br label %bb1
+bb1:
+  store i32 %v, i32* %p, align 4
+  br i1 undef, label %bb1, label %bb2
+bb2:
+  ret i32 0
+}
+
+; Remove redundant store, which is in the same loop as the load.
+define i32 @test33(i1 %c, i32* %p, i32 %i) {
+; CHECK-LABEL: @test33(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[BB1:%.*]]
+; CHECK:       bb1:
+; CHECK-NEXT:    br label [[BB2:%.*]]
+; CHECK:       bb2:
+; CHECK-NEXT:    call void @unknown_func()
+; CHECK-NEXT:    br i1 undef, label [[BB1]], label [[BB3:%.*]]
+; CHECK:       bb3:
+; CHECK-NEXT:    ret i32 0
+;
+entry:
+  br label %bb1
+bb1:
+  %v = load i32, i32* %p, align 4
+  br label %bb2
+bb2:
+  store i32 %v, i32* %p, align 4
+  ; Might read and overwrite value at %p, but doesn't matter.
+  call void @unknown_func()
+  br i1 undef, label %bb1, label %bb3
+bb3:
+  ret i32 0
+}
+
+declare void @unkown_write(i32*)
+
+; We can't remove the "noop" store around an unknown write.
+define void @test43(i32* %Q) {
+; CHECK-LABEL: @test43(
+; CHECK-NEXT:    load
+; CHECK-NEXT:    call
+; CHECK-NEXT:    store
+; CHECK-NEXT:    ret void
+;
+  %a = load i32, i32* %Q
+  call void @unkown_write(i32* %Q)
+  store i32 %a, i32* %Q
+  ret void
+}
+
+; We CAN remove it when the unknown write comes AFTER.
+define void @test44(i32* %Q) {
+; CHECK-LABEL: @test44(
+; CHECK-NEXT:    call
+; CHECK-NEXT:    ret void
+  %a = load i32, i32* %Q
+  store i32 %a, i32* %Q
+  call void @unkown_write(i32* %Q)
+  ret void
+}
+
+define void @test45(i32* %Q) {
+; CHECK-LABEL: @test45(
+; CHECK-NEXT:    [[A:%.*]] = load
+; CHECK-NEXT:    store i32 [[A]]
+; CHECK-NEXT:    ret void
+  %a = load i32, i32* %Q
+  store i32 10, i32* %Q
+  store i32 %a, i32* %Q
+  ret void
+}