diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
--- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -87,6 +87,8 @@
 STATISTIC(NumCFGChecks, "Number of stores modified");
 STATISTIC(NumCFGTries, "Number of stores modified");
 STATISTIC(NumCFGSuccess, "Number of stores modified");
+STATISTIC(NumGetDomMemoryDefPassed,
+          "Number of times a valid candidate is returned from getDomMemoryDef");
 STATISTIC(NumDomMemDefChecks,
           "Number iterations check for reads in getDomMemoryDef");
@@ -116,6 +118,12 @@
     cl::desc("The maximum number of steps while walking upwards to find "
              "MemoryDefs that may be killed (default = 70)"));
 
+static cl::opt<unsigned> MemorySSAPartialStoreLimit(
+    "dse-memoryssa-partial-store-limit", cl::init(5), cl::Hidden,
+    cl::desc("The maximum number candidates that only partially overwrite the "
+             "killing MemoryDef to consider"
+             " (default = 5)"));
+
 static cl::opt<unsigned> MemorySSADefsPerBlockLimit(
     "dse-memoryssa-defs-per-block-limit", cl::init(5000), cl::Hidden,
     cl::desc("The number of MemoryDefs we consider as candidates to eliminated "
@@ -1464,12 +1472,12 @@
 // 2. Check that there are no reads between EarlierAccess and the StartDef by
 //    checking all uses starting at EarlierAccess and walking until we see
 //    StartDef.
-// 3. For each found EarlierDef, check that:
-//   1. There are no barrier instructions between EarlierDef and StartDef (like
+// 3. For each found CurrentDef, check that:
+//   1. There are no barrier instructions between CurrentDef and StartDef (like
 //      throws or stores with ordering constraints).
-//   2. StartDef is executed whenever EarlierDef is executed.
-//   3. StartDef completely overwrites EarlierDef.
-//   4. Erase EarlierDef from the function and MemorySSA.
+//   2. StartDef is executed whenever CurrentDef is executed.
+//   3. StartDef completely overwrites CurrentDef.
+//   4. Erase CurrentDef from the function and MemorySSA.
 
 // Returns true if \p M is an intrisnic that does not read or write memory.
 bool isNoopIntrinsic(MemoryUseOrDef *M) {
@@ -1801,26 +1809,29 @@
     return isRefSet(BatchAA.getModRefInfo(UseInst, DefLoc));
   }
 
-  // Find a MemoryDef writing to \p DefLoc and dominating \p Current, with no
-  // read access between them or on any other path to a function exit block if
-  // \p DefLoc is not accessible after the function returns. If there is no such
-  // MemoryDef, return None. The returned value may not (completely) overwrite
-  // \p DefLoc. Currently we bail out when we encounter an aliasing MemoryUse
-  // (read).
+  // Find a MemoryDef writing to \p DefLoc and dominating \p StartAccess, with
+  // no read access between them or on any other path to a function exit block
+  // if \p DefLoc is not accessible after the function returns. If there is no
+  // such MemoryDef, return None. The returned value may not (completely)
+  // overwrite \p DefLoc. Currently we bail out when we encounter an aliasing
+  // MemoryUse (read).
   Optional<MemoryAccess *>
-  getDomMemoryDef(MemoryDef *KillingDef, MemoryAccess *Current,
+  getDomMemoryDef(MemoryDef *KillingDef, MemoryAccess *StartAccess,
                   MemoryLocation DefLoc, const Value *DefUO, CheckCache &Cache,
-                  unsigned &ScanLimit, unsigned &WalkerStepLimit) {
+                  unsigned &ScanLimit, unsigned &WalkerStepLimit,
+                  bool IsMemTerm, unsigned &PartialLimit) {
     if (ScanLimit == 0 || WalkerStepLimit == 0) {
       LLVM_DEBUG(dbgs() << "\n    ...  hit scan limit\n");
       return None;
     }
 
-    MemoryAccess *StartAccess = Current;
+    MemoryAccess *Current = StartAccess;
+    Instruction *KillingI = KillingDef->getMemoryInst();
     bool StepAgain;
-    LLVM_DEBUG(dbgs() << "  trying to get dominating access for " << *Current
-                      << "\n");
-    // Find the next clobbering Mod access for DefLoc, starting at Current.
+    LLVM_DEBUG(dbgs() << "  trying to get dominating access for "
+                      << *StartAccess << "\n");
+
+    // Find the next clobbering Mod access for DefLoc, starting at StartAccess.
     do {
       StepAgain = false;
       // Reached TOP.
@@ -1839,12 +1850,86 @@
       if (isa<MemoryPhi>(Current))
         break;
 
-      // Check if we can skip EarlierDef for DSE.
-      MemoryDef *CurrentDef = dyn_cast<MemoryDef>(Current);
-      if (CurrentDef &&
-          canSkipDef(CurrentDef, !isInvisibleToCallerBeforeRet(DefUO))) {
+      // Below, check if CurrentDef is a valid candidate to be eliminated by
+      // KillingDef. If it is not, check the next candidate.
+      MemoryDef *CurrentDef = cast<MemoryDef>(Current);
+      Instruction *CurrentI = CurrentDef->getMemoryInst();
+
+      if (canSkipDef(CurrentDef, !isInvisibleToCallerBeforeRet(DefUO))) {
+        StepAgain = true;
+        Current = CurrentDef->getDefiningAccess();
+        continue;
+      }
+
+      // Before we try to remove anything, check for any extra throwing
+      // instructions that block us from DSEing
+      if (mayThrowBetween(KillingI, CurrentI, DefUO)) {
+        LLVM_DEBUG(dbgs() << "  ... skip, may throw!\n");
+        return None;
+      }
+
+      // Check for anything that looks like it will be a barrier to further
+      // removal
+      if (isDSEBarrier(DefUO, CurrentI)) {
+        LLVM_DEBUG(dbgs() << "  ... skip, barrier\n");
+        return None;
+      }
+
+      // If Current is known to be on path that reads DefLoc or is a read
+      // clobber, bail out, as the path is not profitable. We skip this check
+      // for intrinsic calls, because the code knows how to handle memcpy
+      // intrinsics.
+      if (!isa<IntrinsicInst>(CurrentI) &&
+          (Cache.KnownReads.contains(Current) ||
+           isReadClobber(DefLoc, CurrentI))) {
+        Cache.KnownReads.insert(Current);
+        return None;
+      }
+
+      // If Current cannot be analyzed or is not removable, check the next
+      // candidate.
+      if (!hasAnalyzableMemoryWrite(CurrentI, TLI) || !isRemovable(CurrentI)) {
         StepAgain = true;
         Current = CurrentDef->getDefiningAccess();
+        continue;
+      }
+
+      auto CurrentLoc = getLocForWriteEx(CurrentI);
+      if (!CurrentLoc)
+        break;
+
+      if (IsMemTerm) {
+        // If the killing def is a memory terminator (e.g. lifetime.end), check
+        // the next candidate if the current Current does not write the same
+        // underlying object as the terminator.
+        const Value *NIUnd = getUnderlyingObject(CurrentLoc->Ptr);
+        if (DefUO != NIUnd) {
+          StepAgain = true;
+          Current = CurrentDef->getDefiningAccess();
+        }
+        continue;
+      } else {
+        int64_t InstWriteOffset, DepWriteOffset;
+        auto OR = isOverwrite(DefLoc, *CurrentLoc, DL, TLI, DepWriteOffset,
+                              InstWriteOffset, BatchAA, &F);
+        // If Current does not write to the same object as KillingDef, check
+        // the next candidate.
+        if (OR == OW_Unknown) {
+          StepAgain = true;
+          Current = CurrentDef->getDefiningAccess();
+        } else if (OR == OW_MaybePartial) {
+          // If KillingDef only partially overwrites Current, check the next
+          // candidate if the partial step limit is exceeded. This aggressively
+          // limits the number of candidates for partial store elimination,
+          // which are less likely to be removable in the end.
+          if (PartialLimit <= 1) {
+            StepAgain = true;
+            Current = CurrentDef->getDefiningAccess();
+            WalkerStepLimit -= 1;
+            continue;
+          }
+          PartialLimit -= 1;
+        }
       }
     } while (StepAgain);
@@ -2260,10 +2345,14 @@
 
       unsigned ScanLimit = MemorySSAScanLimit;
       unsigned WalkerStepLimit = MemorySSAUpwardsStepLimit;
+      unsigned PartialLimit = MemorySSAPartialStoreLimit;
       // Worklist of MemoryAccesses that may be killed by KillingDef.
       SetVector<MemoryAccess *> ToCheck;
       ToCheck.insert(KillingDef->getDefiningAccess());
 
+      if (!SILocUnd)
+        continue;
+      bool IsMemTerm = State.isMemTerminatorInst(SI);
       DSEState::CheckCache Cache;
       // Check if MemoryAccesses in the worklist are killed by KillingDef.
       for (unsigned I = 0; I < ToCheck.size(); I++) {
@@ -2271,9 +2360,9 @@
         if (State.SkipStores.count(Current))
           continue;
 
-        Optional<MemoryAccess *> Next =
-            State.getDomMemoryDef(KillingDef, Current, SILoc, SILocUnd, Cache,
-                                  ScanLimit, WalkerStepLimit);
+        Optional<MemoryAccess *> Next = State.getDomMemoryDef(
+            KillingDef, Current, SILoc, SILocUnd, Cache, ScanLimit,
+            WalkerStepLimit, IsMemTerm, PartialLimit);
 
         if (!Next) {
           LLVM_DEBUG(dbgs() << "  finished walk\n");
@@ -2301,41 +2390,17 @@
         MemoryDef *NextDef = dyn_cast<MemoryDef>(EarlierAccess);
         Instruction *NI = NextDef->getMemoryInst();
         LLVM_DEBUG(dbgs() << " (" << *NI << ")\n");
-
-        // Before we try to remove anything, check for any extra throwing
-        // instructions that block us from DSEing
-        if (State.mayThrowBetween(SI, NI, SILocUnd)) {
-          LLVM_DEBUG(dbgs() << "  ... skip, may throw!\n");
-          break;
-        }
-
-        // Check for anything that looks like it will be a barrier to further
-        // removal
-        if (State.isDSEBarrier(SILocUnd, NI)) {
-          LLVM_DEBUG(dbgs() << "  ... skip, barrier\n");
-          continue;
-        }
-
         ToCheck.insert(NextDef->getDefiningAccess());
-
-        if (!hasAnalyzableMemoryWrite(NI, TLI)) {
-          LLVM_DEBUG(dbgs() << "  ... skip, cannot analyze def\n");
-          continue;
-        }
-
-        if (!isRemovable(NI)) {
-          LLVM_DEBUG(dbgs() << "  ... skip, cannot remove def\n");
-          continue;
-        }
+        NumGetDomMemoryDefPassed++;
 
         if (!DebugCounter::shouldExecute(MemorySSACounter))
           continue;
 
         MemoryLocation NILoc = *State.getLocForWriteEx(NI);
 
-        if (State.isMemTerminatorInst(SI)) {
+        if (IsMemTerm) {
           const Value *NIUnd = getUnderlyingObject(NILoc.Ptr);
-          if (!SILocUnd || SILocUnd != NIUnd)
+          if (SILocUnd != NIUnd)
             continue;
           LLVM_DEBUG(dbgs() << "DSE: Remove Dead Store:\n  DEAD: " << *NI
                             << "\n  KILLER: " << *SI << '\n');
diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/combined-partial-overwrites.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/combined-partial-overwrites.ll
--- a/llvm/test/Transforms/DeadStoreElimination/MSSA/combined-partial-overwrites.ll
+++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/combined-partial-overwrites.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -dse -enable-dse-memoryssa -enable-dse-partial-store-merging=false < %s | FileCheck %s
+; RUN: opt -S -dse -enable-dse-memoryssa -enable-dse-partial-store-merging=false < %s | FileCheck --check-prefixes=CHECK,DEFAULT-LIMIT %s
+; RUN: opt -S -dse -enable-dse-memoryssa -enable-dse-partial-store-merging=false -dse-memoryssa-partial-store-limit=10 < %s | FileCheck --check-prefixes=CHECK,LARGER-LIMIT %s
 
 target datalayout = "E-m:e-i64:64-n32:64"
 target triple = "powerpc64le-unknown-linux"
@@ -209,22 +210,43 @@
 declare void @goFunc(%struct.foostruct*)
 declare i32 @fa(i8*, i8**, i32, i8, i8*)
 
+; We miss this case, because of an aggressive limit of partial overlap analysis.
+; With a larger partial store limit, we remove the memset.
define void @test4() { -; CHECK-LABEL: @test4( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[BANG:%.*]] = alloca [[STRUCT_FOOSTRUCT:%.*]], align 8 -; CHECK-NEXT: [[V2:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 0 -; CHECK-NEXT: store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** [[V2]], align 8 -; CHECK-NEXT: [[V3:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 1 -; CHECK-NEXT: store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** [[V3]], align 8 -; CHECK-NEXT: [[V4:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 2 -; CHECK-NEXT: store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** [[V4]], align 8 -; CHECK-NEXT: [[V5:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 3 -; CHECK-NEXT: store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** [[V5]], align 8 -; CHECK-NEXT: [[V6:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 4 -; CHECK-NEXT: store void (i8*, i32, i32)* null, void (i8*, i32, i32)** [[V6]], align 8 -; CHECK-NEXT: call void @goFunc(%struct.foostruct* [[BANG]]) -; CHECK-NEXT: ret void +; DEFAULT-LIMIT-LABEL: @test4( +; DEFAULT-LIMIT-NEXT: entry: +; DEFAULT-LIMIT-NEXT: [[BANG:%.*]] = alloca [[STRUCT_FOOSTRUCT:%.*]], align 8 +; DEFAULT-LIMIT-NEXT: [[V1:%.*]] = bitcast %struct.foostruct* [[BANG]] to i8* +; DEFAULT-LIMIT-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[V1]], i64 32 +; DEFAULT-LIMIT-NEXT: call void @llvm.memset.p0i8.i64(i8* align 8 [[TMP0]], i8 0, i64 8, i1 false) +; DEFAULT-LIMIT-NEXT: [[V2:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 0 +; DEFAULT-LIMIT-NEXT: store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** [[V2]], align 8 +; DEFAULT-LIMIT-NEXT: [[V3:%.*]] = getelementptr 
inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 1 +; DEFAULT-LIMIT-NEXT: store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** [[V3]], align 8 +; DEFAULT-LIMIT-NEXT: [[V4:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 2 +; DEFAULT-LIMIT-NEXT: store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** [[V4]], align 8 +; DEFAULT-LIMIT-NEXT: [[V5:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 3 +; DEFAULT-LIMIT-NEXT: store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** [[V5]], align 8 +; DEFAULT-LIMIT-NEXT: [[V6:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 4 +; DEFAULT-LIMIT-NEXT: store void (i8*, i32, i32)* null, void (i8*, i32, i32)** [[V6]], align 8 +; DEFAULT-LIMIT-NEXT: call void @goFunc(%struct.foostruct* [[BANG]]) +; DEFAULT-LIMIT-NEXT: ret void +; +; LARGER-LIMIT-LABEL: @test4( +; LARGER-LIMIT-NEXT: entry: +; LARGER-LIMIT-NEXT: [[BANG:%.*]] = alloca [[STRUCT_FOOSTRUCT:%.*]], align 8 +; LARGER-LIMIT-NEXT: [[V2:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 0 +; LARGER-LIMIT-NEXT: store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** [[V2]], align 8 +; LARGER-LIMIT-NEXT: [[V3:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 1 +; LARGER-LIMIT-NEXT: store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** [[V3]], align 8 +; LARGER-LIMIT-NEXT: [[V4:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 2 +; LARGER-LIMIT-NEXT: store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** [[V4]], align 8 +; LARGER-LIMIT-NEXT: [[V5:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 3 +; LARGER-LIMIT-NEXT: store i32 (i8*, i8**, i32, i8, 
i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** [[V5]], align 8 +; LARGER-LIMIT-NEXT: [[V6:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 4 +; LARGER-LIMIT-NEXT: store void (i8*, i32, i32)* null, void (i8*, i32, i32)** [[V6]], align 8 +; LARGER-LIMIT-NEXT: call void @goFunc(%struct.foostruct* [[BANG]]) +; LARGER-LIMIT-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/debug-counter.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/debug-counter.ll --- a/llvm/test/Transforms/DeadStoreElimination/MSSA/debug-counter.ll +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/debug-counter.ll @@ -1,7 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; XFAIL: * - ; REQUIRES: asserts ; Eliminates store to %R in the entry block. diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/memoryssa-scan-limit.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/memoryssa-scan-limit.ll --- a/llvm/test/Transforms/DeadStoreElimination/MSSA/memoryssa-scan-limit.ll +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/memoryssa-scan-limit.ll @@ -1,7 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; XFAIL: * - ; RUN: opt < %s -basic-aa -dse -enable-dse-memoryssa -S | FileCheck --check-prefix=NO-LIMIT %s ; RUN: opt < %s -basic-aa -dse -enable-dse-memoryssa -dse-memoryssa-scanlimit=0 -S | FileCheck --check-prefix=LIMIT-0 %s ; RUN: opt < %s -basic-aa -dse -enable-dse-memoryssa -dse-memoryssa-scanlimit=2 -S | FileCheck --check-prefix=LIMIT-2 %s diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-overlap.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-overlap.ll --- a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-overlap.ll +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-overlap.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -dse 
-enable-dse-memoryssa %s -S | FileCheck %s +; RUN: opt -dse -enable-dse-memoryssa %s -S | FileCheck --check-prefixes=CHECK,DEFAULT-LIMIT %s +; RUN: opt -dse -enable-dse-memoryssa -dse-memoryssa-partial-store-limit=10 %s -S | FileCheck --check-prefixes=CHECK,LARGER-LIMIT %s %struct.ham = type { [3 x double], [3 x double]} @@ -7,28 +8,55 @@ declare void @may_throw() declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1 immarg) +; We miss this case, because of an aggressive limit of partial overlap analysis. +; With a larger partial store limit, we remove the memset. define void @overlap1(%struct.ham* %arg, i1 %cond) { -; CHECK-LABEL: @overlap1( -; CHECK-NEXT: bb: -; CHECK-NEXT: [[TMP:%.*]] = getelementptr inbounds [[STRUCT_HAM:%.*]], %struct.ham* [[ARG:%.*]], i64 0, i32 0, i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 0, i64 1 -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 0, i64 0 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 1, i64 2 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 1, i64 1 -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 1, i32 0 -; CHECK-NEXT: br i1 [[COND:%.*]], label [[BB7:%.*]], label [[BB8:%.*]] -; CHECK: bb7: -; CHECK-NEXT: br label [[BB9:%.*]] -; CHECK: bb8: -; CHECK-NEXT: br label [[BB9]] -; CHECK: bb9: -; CHECK-NEXT: store double 1.000000e+00, double* [[TMP2]], align 8 -; CHECK-NEXT: store double 2.000000e+00, double* [[TMP1]], align 8 -; CHECK-NEXT: store double 3.000000e+00, double* [[TMP]], align 8 -; CHECK-NEXT: store double 4.000000e+00, double* [[TMP5]], align 8 -; CHECK-NEXT: store double 5.000000e+00, double* [[TMP4]], align 8 -; CHECK-NEXT: store double 6.000000e+00, double* [[TMP3]], align 8 -; CHECK-NEXT: ret void +; 
DEFAULT-LIMIT-LABEL: @overlap1( +; DEFAULT-LIMIT-NEXT: bb: +; DEFAULT-LIMIT-NEXT: [[TMP:%.*]] = getelementptr inbounds [[STRUCT_HAM:%.*]], %struct.ham* [[ARG:%.*]], i64 0, i32 0, i64 2 +; DEFAULT-LIMIT-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 0, i64 1 +; DEFAULT-LIMIT-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 0, i64 0 +; DEFAULT-LIMIT-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 1, i64 2 +; DEFAULT-LIMIT-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 1, i64 1 +; DEFAULT-LIMIT-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 1, i32 0 +; DEFAULT-LIMIT-NEXT: [[TMP6:%.*]] = bitcast double* [[TMP2]] to i8* +; DEFAULT-LIMIT-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[TMP6]], i64 32 +; DEFAULT-LIMIT-NEXT: call void @llvm.memset.p0i8.i64(i8* nonnull align 8 dereferenceable(48) [[TMP0]], i8 0, i64 16, i1 false) +; DEFAULT-LIMIT-NEXT: br i1 [[COND:%.*]], label [[BB7:%.*]], label [[BB8:%.*]] +; DEFAULT-LIMIT: bb7: +; DEFAULT-LIMIT-NEXT: br label [[BB9:%.*]] +; DEFAULT-LIMIT: bb8: +; DEFAULT-LIMIT-NEXT: br label [[BB9]] +; DEFAULT-LIMIT: bb9: +; DEFAULT-LIMIT-NEXT: store double 1.000000e+00, double* [[TMP2]], align 8 +; DEFAULT-LIMIT-NEXT: store double 2.000000e+00, double* [[TMP1]], align 8 +; DEFAULT-LIMIT-NEXT: store double 3.000000e+00, double* [[TMP]], align 8 +; DEFAULT-LIMIT-NEXT: store double 4.000000e+00, double* [[TMP5]], align 8 +; DEFAULT-LIMIT-NEXT: store double 5.000000e+00, double* [[TMP4]], align 8 +; DEFAULT-LIMIT-NEXT: store double 6.000000e+00, double* [[TMP3]], align 8 +; DEFAULT-LIMIT-NEXT: ret void +; +; LARGER-LIMIT-LABEL: @overlap1( +; LARGER-LIMIT-NEXT: bb: +; LARGER-LIMIT-NEXT: [[TMP:%.*]] = getelementptr inbounds [[STRUCT_HAM:%.*]], %struct.ham* [[ARG:%.*]], i64 0, i32 0, i64 2 +; LARGER-LIMIT-NEXT: 
[[TMP1:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 0, i64 1 +; LARGER-LIMIT-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 0, i64 0 +; LARGER-LIMIT-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 1, i64 2 +; LARGER-LIMIT-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 1, i64 1 +; LARGER-LIMIT-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 1, i32 0 +; LARGER-LIMIT-NEXT: br i1 [[COND:%.*]], label [[BB7:%.*]], label [[BB8:%.*]] +; LARGER-LIMIT: bb7: +; LARGER-LIMIT-NEXT: br label [[BB9:%.*]] +; LARGER-LIMIT: bb8: +; LARGER-LIMIT-NEXT: br label [[BB9]] +; LARGER-LIMIT: bb9: +; LARGER-LIMIT-NEXT: store double 1.000000e+00, double* [[TMP2]], align 8 +; LARGER-LIMIT-NEXT: store double 2.000000e+00, double* [[TMP1]], align 8 +; LARGER-LIMIT-NEXT: store double 3.000000e+00, double* [[TMP]], align 8 +; LARGER-LIMIT-NEXT: store double 4.000000e+00, double* [[TMP5]], align 8 +; LARGER-LIMIT-NEXT: store double 5.000000e+00, double* [[TMP4]], align 8 +; LARGER-LIMIT-NEXT: store double 6.000000e+00, double* [[TMP3]], align 8 +; LARGER-LIMIT-NEXT: ret void ; bb: %tmp = getelementptr inbounds %struct.ham, %struct.ham* %arg, i64 0, i32 0, i64 2 diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/simple.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/simple.ll --- a/llvm/test/Transforms/DeadStoreElimination/MSSA/simple.ll +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/simple.ll @@ -477,10 +477,8 @@ ret i32 0 } -; TODO -; We can remove redundant store, as noalias %p guarantees that the function does -; only access it via %p. This also holds for the call to unknown_func even though -; it could unwind +; We cannot remove any stores, because @unknown_func may unwind and the caller +; may read %p while unwinding. 
define void @test34(i32* noalias %p) { ; CHECK-LABEL: @test34( ; CHECK-NEXT: store i32 1, i32* [[P:%.*]], align 4 @@ -636,9 +634,10 @@ ret void } -; I think this case is currently handled incorrectly by memdeps dse -; throwing should leave store i32 1, not remove from the free. declare void @free(i8* nocapture) + +; We cannot remove `store i32 1, i32* %p`, because @unknown_func may unwind +; and the caller may read %p while unwinding. define void @test41(i32* noalias %P) { ; CHECK-LABEL: @test41( ; CHECK-NEXT: [[P2:%.*]] = bitcast i32* [[P:%.*]] to i8*