diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
--- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -87,6 +87,8 @@
 STATISTIC(NumCFGChecks, "Number of stores modified");
 STATISTIC(NumCFGTries, "Number of stores modified");
 STATISTIC(NumCFGSuccess, "Number of stores modified");
+STATISTIC(NumGetDomMemoryDefPassed,
+          "Number of times a valid candidate is returned from getDomMemoryDef");
 STATISTIC(NumDomMemDefChecks,
           "Number iterations check for reads in getDomMemoryDef");
@@ -116,6 +118,12 @@
     cl::desc("The maximum number of steps while walking upwards to find "
              "MemoryDefs that may be killed (default = 70)"));

+static cl::opt<unsigned> MemorySSAPartialStoreLimit(
+    "dse-memoryssa-partial-store-limit", cl::init(5), cl::Hidden,
+    cl::desc("The maximum number of candidates that only partially overwrite "
+             "the killing MemoryDef to consider"
+             " (default = 5)"));
+
 static cl::opt<unsigned> MemorySSADefsPerBlockLimit(
     "dse-memoryssa-defs-per-block-limit", cl::init(5000), cl::Hidden,
     cl::desc("The number of MemoryDefs we consider as candidates to eliminated "
@@ -1809,7 +1817,8 @@
   Optional<MemoryAccess *>
   getDomMemoryDef(MemoryDef *KillingDef, MemoryAccess *Current,
                   MemoryLocation DefLoc, const Value *DefUO, CheckCache &Cache,
-                  unsigned &ScanLimit, unsigned &WalkerStepLimit) {
+                  unsigned &ScanLimit, unsigned &WalkerStepLimit,
+                  bool IsMemTerm, unsigned &PartialLimit) {
     if (ScanLimit == 0 || WalkerStepLimit == 0) {
       LLVM_DEBUG(dbgs() << "\n    ...  hit scan limit\n");
       return None;
@@ -1817,9 +1826,11 @@
     MemoryAccess *DomAccess;
     MemoryAccess *StartAccess = Current;
+    Instruction *KillingI = KillingDef->getMemoryInst();
     bool StepAgain;
     LLVM_DEBUG(dbgs() << "  trying to get dominating access for " << *Current
                       << "\n");
+    // Find the next clobbering Mod access for DefLoc, starting at Current.
     do {
       StepAgain = false;
@@ -1853,13 +1864,86 @@
       if (isa<MemoryPhi>(DomAccess))
         break;

-      // Check if we can skip DomDef for DSE.
-      MemoryDef *DomDef = dyn_cast<MemoryDef>(DomAccess);
-      if (DomDef && canSkipDef(DomDef, !isInvisibleToCallerBeforeRet(DefUO))) {
+      // Below, check if DomDef is a valid candidate to be eliminated by
+      // KillingDef. If it is not, check the next candidate.
+      MemoryDef *DomDef = cast<MemoryDef>(DomAccess);
+      Instruction *DomI = DomDef->getMemoryInst();
+
+      if (canSkipDef(DomDef, !isInvisibleToCallerBeforeRet(DefUO))) {
+        StepAgain = true;
+        Current = DomDef->getDefiningAccess();
+        continue;
+      }
+
+      // Before we try to remove anything, check for any extra throwing
+      // instructions that block us from DSEing
+      if (mayThrowBetween(KillingI, DomI, DefUO)) {
+        LLVM_DEBUG(dbgs() << "  ... skip, may throw!\n");
+        return None;
+      }
+
+      // Check for anything that looks like it will be a barrier to further
+      // removal
+      if (isDSEBarrier(DefUO, DomI)) {
+        LLVM_DEBUG(dbgs() << "  ... skip, barrier\n");
+        return None;
+      }
+
+      // If DomAccess is known to be on a path that reads DefLoc or is a read
+      // clobber, bail out, as the path is not profitable. We skip this check
+      // for intrinsic calls, because the code knows how to handle memcpy
+      // intrinsics.
+      if (!isa<IntrinsicInst>(DomI) && (Cache.KnownReads.contains(DomAccess) ||
+                                        isReadClobber(DefLoc, DomI))) {
+        Cache.KnownReads.insert(DomAccess);
+        return None;
+      }
+
+      // If DomAccess cannot be analyzed or is not removable, check the next
+      // candidate.
+      if (!hasAnalyzableMemoryWrite(DomI, TLI) || !isRemovable(DomI)) {
         StepAgain = true;
         Current = DomDef->getDefiningAccess();
+        continue;
       }
+      auto DomLoc = getLocForWriteEx(DomI);
+      if (!DomLoc)
+        break;
+
+      if (IsMemTerm) {
+        // If the killing def is a memory terminator (e.g. lifetime.end), check
+        // the next candidate if the current DomAccess does not write the same
+        // underlying object as the terminator.
+        const Value *NIUnd = getUnderlyingObject(DomLoc->Ptr);
+        if (DefUO != NIUnd) {
+          StepAgain = true;
+          Current = DomDef->getDefiningAccess();
+        }
+        continue;
+      } else {
+        int64_t InstWriteOffset, DepWriteOffset;
+        auto OR = isOverwrite(DefLoc, *DomLoc, DL, TLI, DepWriteOffset,
+                              InstWriteOffset, BatchAA, &F);
+        // If DomAccess does not write to the same object as KillingDef, check
+        // the next candidate.
+        if (OR == OW_Unknown) {
+          StepAgain = true;
+          Current = DomDef->getDefiningAccess();
+        } else if (OR == OW_MaybePartial) {
+          // If KillingDef only partially overwrites DomAccess, check the next
+          // candidate if the partial step limit is exceeded. This aggressively
+          // limits the number of candidates for partial store elimination,
+          // which are less likely to be removable in the end.
+          if (PartialLimit <= 1) {
+            StepAgain = true;
+            Current = DomDef->getDefiningAccess();
+            WalkerStepLimit -= 1;
+            continue;
+          }
+          PartialLimit -= 1;
+        }
+      }
     } while (StepAgain);

@@ -1867,7 +1951,7 @@
     // Accesses to objects accessible after the function returns can only be
     // eliminated if the access is killed along all paths to the exit. Collect
     // the blocks with killing (=completely overwriting MemoryDefs) and check if
     // they cover all paths from DomAccess to any function exit.
     SmallPtrSet<Instruction *, 16> KillingDefs;
-    KillingDefs.insert(KillingDef->getMemoryInst());
+    KillingDefs.insert(KillingI);
     Instruction *DomMemInst = isa<MemoryDef>(DomAccess)
                                   ? cast<MemoryDef>(DomAccess)->getMemoryInst()
                                   : nullptr;
@@ -2278,10 +2362,14 @@
     unsigned ScanLimit = MemorySSAScanLimit;
     unsigned WalkerStepLimit = MemorySSAUpwardsStepLimit;
+    unsigned PartialLimit = MemorySSAPartialStoreLimit;
     // Worklist of MemoryAccesses that may be killed by KillingDef.
     SetVector<MemoryAccess *> ToCheck;
     ToCheck.insert(KillingDef->getDefiningAccess());

+    if (!SILocUnd)
+      continue;
+    bool IsMemTerm = State.isMemTerminatorInst(SI);
     DSEState::CheckCache Cache;
     // Check if MemoryAccesses in the worklist are killed by KillingDef.
     for (unsigned I = 0; I < ToCheck.size(); I++) {
@@ -2289,9 +2377,9 @@
       if (State.SkipStores.count(Current))
         continue;

-      Optional<MemoryAccess *> Next =
-          State.getDomMemoryDef(KillingDef, Current, SILoc, SILocUnd, Cache,
-                                ScanLimit, WalkerStepLimit);
+      Optional<MemoryAccess *> Next = State.getDomMemoryDef(
+          KillingDef, Current, SILoc, SILocUnd, Cache, ScanLimit,
+          WalkerStepLimit, IsMemTerm, PartialLimit);

       if (!Next) {
         LLVM_DEBUG(dbgs() << "  finished walk\n");
@@ -2319,41 +2407,17 @@
       MemoryDef *NextDef = dyn_cast<MemoryDef>(DomAccess);
       Instruction *NI = NextDef->getMemoryInst();
       LLVM_DEBUG(dbgs() << " (" << *NI << ")\n");
-
-      // Before we try to remove anything, check for any extra throwing
-      // instructions that block us from DSEing
-      if (State.mayThrowBetween(SI, NI, SILocUnd)) {
-        LLVM_DEBUG(dbgs() << "  ... skip, may throw!\n");
-        break;
-      }
-
-      // Check for anything that looks like it will be a barrier to further
-      // removal
-      if (State.isDSEBarrier(SILocUnd, NI)) {
-        LLVM_DEBUG(dbgs() << "  ... skip, barrier\n");
-        continue;
-      }
-
       ToCheck.insert(NextDef->getDefiningAccess());
-
-      if (!hasAnalyzableMemoryWrite(NI, TLI)) {
-        LLVM_DEBUG(dbgs() << "  ... skip, cannot analyze def\n");
-        continue;
-      }
-
-      if (!isRemovable(NI)) {
-        LLVM_DEBUG(dbgs() << "  ... skip, cannot remove def\n");
-        continue;
-      }
+      NumGetDomMemoryDefPassed++;

       if (!DebugCounter::shouldExecute(MemorySSACounter))
         continue;

       MemoryLocation NILoc = *State.getLocForWriteEx(NI);

-      if (State.isMemTerminatorInst(SI)) {
+      if (IsMemTerm) {
         const Value *NIUnd = getUnderlyingObject(NILoc.Ptr);
-        if (!SILocUnd || SILocUnd != NIUnd)
+        if (SILocUnd != NIUnd)
           continue;
         LLVM_DEBUG(dbgs() << "DSE: Remove Dead Store:\n  DEAD: " << *NI
                           << "\n  KILLER: " << *SI << '\n');
diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/combined-partial-overwrites.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/combined-partial-overwrites.ll
--- a/llvm/test/Transforms/DeadStoreElimination/MSSA/combined-partial-overwrites.ll
+++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/combined-partial-overwrites.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -S -dse -enable-dse-memoryssa -enable-dse-partial-store-merging=false < %s | FileCheck %s
+; RUN: opt -S -dse -enable-dse-memoryssa -enable-dse-partial-store-merging=false -dse-memoryssa-partial-store-limit=10 < %s | FileCheck --check-prefix=LARGER-LIMIT %s

 target datalayout = "E-m:e-i64:64-n32:64"
 target triple = "powerpc64le-unknown-linux"
@@ -34,6 +35,34 @@
 ; CHECK-NEXT:    store float [[MUL_I_I_I]], float* [[_M_VALUE_IMAGP_I_I]], align 4
 ; CHECK-NEXT:    ret void
 ;
+; LARGER-LIMIT-LABEL: @_Z4testSt7complexIfE(
+; LARGER-LIMIT-NEXT:  entry:
+; LARGER-LIMIT-NEXT:    [[REF_TMP:%.*]] = alloca i64, align 8
+; LARGER-LIMIT-NEXT:    [[TMPCAST:%.*]] = bitcast i64* [[REF_TMP]] to %"struct.std::complex"*
+; LARGER-LIMIT-NEXT:    [[C_SROA_0_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[C_COERCE:%.*]], 32
+; LARGER-LIMIT-NEXT:    [[C_SROA_0_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[C_SROA_0_0_EXTRACT_SHIFT]] to i32
+; LARGER-LIMIT-NEXT:    [[TMP0:%.*]] = bitcast i32 [[C_SROA_0_0_EXTRACT_TRUNC]] to float
+; LARGER-LIMIT-NEXT:    [[C_SROA_2_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[C_COERCE]] to i32
+; LARGER-LIMIT-NEXT:    [[TMP1:%.*]] = bitcast i32 [[C_SROA_2_0_EXTRACT_TRUNC]] to float
+; LARGER-LIMIT-NEXT:    call void @_Z3barSt7complexIfE(%"struct.std::complex"* nonnull sret [[TMPCAST]], i64 [[C_COERCE]])
+; LARGER-LIMIT-NEXT:    [[TMP2:%.*]] = load i64, i64* [[REF_TMP]], align 8
+; LARGER-LIMIT-NEXT:    [[_M_VALUE_REALP_I_I:%.*]] = getelementptr inbounds %"struct.std::complex", %"struct.std::complex"* [[AGG_RESULT:%.*]], i64 0, i32 0, i32 0
+; LARGER-LIMIT-NEXT:    [[TMP3:%.*]] = lshr i64 [[TMP2]], 32
+; LARGER-LIMIT-NEXT:    [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
+; LARGER-LIMIT-NEXT:    [[TMP5:%.*]] = bitcast i32 [[TMP4]] to float
+; LARGER-LIMIT-NEXT:    [[_M_VALUE_IMAGP_I_I:%.*]] = getelementptr inbounds %"struct.std::complex", %"struct.std::complex"* [[AGG_RESULT]], i64 0, i32 0, i32 1
+; LARGER-LIMIT-NEXT:    [[TMP6:%.*]] = trunc i64 [[TMP2]] to i32
+; LARGER-LIMIT-NEXT:    [[TMP7:%.*]] = bitcast i32 [[TMP6]] to float
+; LARGER-LIMIT-NEXT:    [[MUL_AD_I_I:%.*]] = fmul fast float [[TMP5]], [[TMP1]]
+; LARGER-LIMIT-NEXT:    [[MUL_BC_I_I:%.*]] = fmul fast float [[TMP7]], [[TMP0]]
+; LARGER-LIMIT-NEXT:    [[MUL_I_I_I:%.*]] = fadd fast float [[MUL_AD_I_I]], [[MUL_BC_I_I]]
+; LARGER-LIMIT-NEXT:    [[MUL_AC_I_I:%.*]] = fmul fast float [[TMP5]], [[TMP0]]
+; LARGER-LIMIT-NEXT:    [[MUL_BD_I_I:%.*]] = fmul fast float [[TMP7]], [[TMP1]]
+; LARGER-LIMIT-NEXT:    [[MUL_R_I_I:%.*]] = fsub fast float [[MUL_AC_I_I]], [[MUL_BD_I_I]]
+; LARGER-LIMIT-NEXT:    store float [[MUL_R_I_I]], float* [[_M_VALUE_REALP_I_I]], align 4
+; LARGER-LIMIT-NEXT:    store float [[MUL_I_I_I]], float* [[_M_VALUE_IMAGP_I_I]], align 4
+; LARGER-LIMIT-NEXT:    ret void
+;
 entry:
   %ref.tmp = alloca i64, align 8
@@ -81,6 +110,18 @@
 ; CHECK-NEXT:    store i16 2020, i16* [[WPTRP]], align 1
 ; CHECK-NEXT:    ret void
 ;
+; LARGER-LIMIT-LABEL: @test1(
+; LARGER-LIMIT-NEXT:  entry:
+; LARGER-LIMIT-NEXT:    [[BPTR:%.*]] = bitcast i32* [[PTR:%.*]] to i8*
+; LARGER-LIMIT-NEXT:    [[WPTR:%.*]] = bitcast i32* [[PTR]] to i16*
+; LARGER-LIMIT-NEXT:    store i16 -30062, i16* [[WPTR]], align 2
+; LARGER-LIMIT-NEXT:    [[BPTR3:%.*]] = getelementptr inbounds i8, i8* [[BPTR]], i64 3
+; LARGER-LIMIT-NEXT:    store i8 47, i8* [[BPTR3]], align 1
+; LARGER-LIMIT-NEXT:    [[BPTR1:%.*]] = getelementptr inbounds i8, i8* [[BPTR]], i64 1
+; LARGER-LIMIT-NEXT:    [[WPTRP:%.*]] = bitcast i8* [[BPTR1]] to i16*
+; LARGER-LIMIT-NEXT:    store i16 2020, i16* [[WPTRP]], align 1
+; LARGER-LIMIT-NEXT:    ret void
+;
 entry:
   store i32 5, i32* %ptr
@@ -120,6 +161,25 @@
 ; CHECK-NEXT:    store i16 5656, i16* [[WPTR3]], align 1
 ; CHECK-NEXT:    ret void
 ;
+; LARGER-LIMIT-LABEL: @test2(
+; LARGER-LIMIT-NEXT:  entry:
+; LARGER-LIMIT-NEXT:    [[BPTR:%.*]] = bitcast i32* [[PTR:%.*]] to i8*
+; LARGER-LIMIT-NEXT:    [[BPTRM1:%.*]] = getelementptr inbounds i8, i8* [[BPTR]], i64 -1
+; LARGER-LIMIT-NEXT:    [[BPTR1:%.*]] = getelementptr inbounds i8, i8* [[BPTR]], i64 1
+; LARGER-LIMIT-NEXT:    [[BPTR2:%.*]] = getelementptr inbounds i8, i8* [[BPTR]], i64 2
+; LARGER-LIMIT-NEXT:    [[BPTR3:%.*]] = getelementptr inbounds i8, i8* [[BPTR]], i64 3
+; LARGER-LIMIT-NEXT:    [[WPTR:%.*]] = bitcast i8* [[BPTR]] to i16*
+; LARGER-LIMIT-NEXT:    [[WPTRM1:%.*]] = bitcast i8* [[BPTRM1]] to i16*
+; LARGER-LIMIT-NEXT:    [[WPTR1:%.*]] = bitcast i8* [[BPTR1]] to i16*
+; LARGER-LIMIT-NEXT:    [[WPTR2:%.*]] = bitcast i8* [[BPTR2]] to i16*
+; LARGER-LIMIT-NEXT:    [[WPTR3:%.*]] = bitcast i8* [[BPTR3]] to i16*
+; LARGER-LIMIT-NEXT:    store i16 1456, i16* [[WPTRM1]], align 1
+; LARGER-LIMIT-NEXT:    store i16 1346, i16* [[WPTR]], align 1
+; LARGER-LIMIT-NEXT:    store i16 1756, i16* [[WPTR1]], align 1
+; LARGER-LIMIT-NEXT:    store i16 1126, i16* [[WPTR2]], align 1
+; LARGER-LIMIT-NEXT:    store i16 5656, i16* [[WPTR3]], align 1
+; LARGER-LIMIT-NEXT:    ret void
+;
 entry:
   store i32 5, i32* %ptr
@@ -170,6 +230,27 @@
 ; CHECK-NEXT:    store i16 5656, i16* [[WPTR3]], align 1
 ; CHECK-NEXT:    ret i8 [[V]]
 ;
+; LARGER-LIMIT-LABEL: @test3(
+; LARGER-LIMIT-NEXT:  entry:
+; LARGER-LIMIT-NEXT:    store i32 5, i32* [[PTR:%.*]], align 4
+; LARGER-LIMIT-NEXT:    [[BPTR:%.*]] = bitcast i32* [[PTR]] to i8*
+; LARGER-LIMIT-NEXT:    [[BPTRM1:%.*]] = getelementptr inbounds i8, i8* [[BPTR]], i64 -1
+; LARGER-LIMIT-NEXT:    [[BPTR1:%.*]] = getelementptr inbounds i8, i8* [[BPTR]], i64 1
+; LARGER-LIMIT-NEXT:    [[BPTR2:%.*]] = getelementptr inbounds i8, i8* [[BPTR]], i64 2
+; LARGER-LIMIT-NEXT:    [[BPTR3:%.*]] = getelementptr inbounds i8, i8* [[BPTR]], i64 3
+; LARGER-LIMIT-NEXT:    [[WPTR:%.*]] = bitcast i8* [[BPTR]] to i16*
+; LARGER-LIMIT-NEXT:    [[WPTRM1:%.*]] = bitcast i8* [[BPTRM1]] to i16*
+; LARGER-LIMIT-NEXT:    [[WPTR1:%.*]] = bitcast i8* [[BPTR1]] to i16*
+; LARGER-LIMIT-NEXT:    [[WPTR2:%.*]] = bitcast i8* [[BPTR2]] to i16*
+; LARGER-LIMIT-NEXT:    [[WPTR3:%.*]] = bitcast i8* [[BPTR3]] to i16*
+; LARGER-LIMIT-NEXT:    [[V:%.*]] = load i8, i8* [[BPTR]], align 1
+; LARGER-LIMIT-NEXT:    store i16 1456, i16* [[WPTRM1]], align 1
+; LARGER-LIMIT-NEXT:    store i16 1346, i16* [[WPTR]], align 1
+; LARGER-LIMIT-NEXT:    store i16 1756, i16* [[WPTR1]], align 1
+; LARGER-LIMIT-NEXT:    store i16 1126, i16* [[WPTR2]], align 1
+; LARGER-LIMIT-NEXT:    store i16 5656, i16* [[WPTR3]], align 1
+; LARGER-LIMIT-NEXT:    ret i8 [[V]]
+;
 entry:
   store i32 5, i32* %ptr
@@ -209,10 +290,15 @@
 declare void @goFunc(%struct.foostruct*)
 declare i32 @fa(i8*, i8**, i32, i8, i8*)

+; We miss this case, because of an aggressive limit on partial overlap
+; analysis. With a larger partial store limit, we remove the memset.
 define void @test4() {
 ; CHECK-LABEL: @test4(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[BANG:%.*]] = alloca [[STRUCT_FOOSTRUCT:%.*]], align 8
+; CHECK-NEXT:    [[V1:%.*]] = bitcast %struct.foostruct* [[BANG]] to i8*
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[V1]], i64 32
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* align 8 [[TMP0]], i8 0, i64 8, i1 false)
 ; CHECK-NEXT:    [[V2:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 0
 ; CHECK-NEXT:    store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** [[V2]], align 8
 ; CHECK-NEXT:    [[V3:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 1
@@ -226,6 +312,22 @@
 ; CHECK-NEXT:    call void @goFunc(%struct.foostruct* [[BANG]])
 ; CHECK-NEXT:    ret void
 ;
+; LARGER-LIMIT-LABEL: @test4(
+; LARGER-LIMIT-NEXT:  entry:
+; LARGER-LIMIT-NEXT:    [[BANG:%.*]] = alloca [[STRUCT_FOOSTRUCT:%.*]], align 8
+; LARGER-LIMIT-NEXT:    [[V2:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 0
+; LARGER-LIMIT-NEXT:    store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** [[V2]], align 8
+; LARGER-LIMIT-NEXT:    [[V3:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 1
+; LARGER-LIMIT-NEXT:    store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** [[V3]], align 8
+; LARGER-LIMIT-NEXT:    [[V4:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 2
+; LARGER-LIMIT-NEXT:    store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** [[V4]], align 8
+; LARGER-LIMIT-NEXT:    [[V5:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 3
+; LARGER-LIMIT-NEXT:    store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** [[V5]], align 8
+; LARGER-LIMIT-NEXT:    [[V6:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 4
+; LARGER-LIMIT-NEXT:    store void (i8*, i32, i32)* null, void (i8*, i32, i32)** [[V6]], align 8
+; LARGER-LIMIT-NEXT:    call void @goFunc(%struct.foostruct* [[BANG]])
+; LARGER-LIMIT-NEXT:    ret void
+;
 entry:
   %bang = alloca %struct.foostruct, align 8
@@ -261,6 +363,20 @@
 ; CHECK-NEXT:    store i16 1346, i16* [[WPTR]], align 1
 ; CHECK-NEXT:    ret i8 0
 ;
+; LARGER-LIMIT-LABEL: @test5(
+; LARGER-LIMIT-NEXT:  entry:
+; LARGER-LIMIT-NEXT:    [[BPTR:%.*]] = bitcast i32* [[PTR:%.*]] to i8*
+; LARGER-LIMIT-NEXT:    [[BPTR1:%.*]] = getelementptr inbounds i8, i8* [[BPTR]], i64 1
+; LARGER-LIMIT-NEXT:    [[BPTR2:%.*]] = getelementptr inbounds i8, i8* [[BPTR]], i64 2
+; LARGER-LIMIT-NEXT:    [[BPTR3:%.*]] = getelementptr inbounds i8, i8* [[BPTR]], i64 3
+; LARGER-LIMIT-NEXT:    [[WPTR:%.*]] = bitcast i8* [[BPTR]] to i16*
+; LARGER-LIMIT-NEXT:    [[WPTR1:%.*]] = bitcast i8* [[BPTR1]] to i16*
+; LARGER-LIMIT-NEXT:    [[WPTR2:%.*]] = bitcast i8* [[BPTR2]] to i16*
+; LARGER-LIMIT-NEXT:    store i16 -1, i16* [[WPTR2]], align 1
+; LARGER-LIMIT-NEXT:    store i16 1456, i16* [[WPTR1]], align 1
+; LARGER-LIMIT-NEXT:    store i16 1346, i16* [[WPTR]], align 1
+; LARGER-LIMIT-NEXT:    ret i8 0
+;
 entry:
   store i32 0, i32* %ptr
@@ -292,6 +408,15 @@
 ; CHECK-NEXT:    store i16 -1, i16* [[BPTR1]], align 1
 ; CHECK-NEXT:    ret i8 0
 ;
+; LARGER-LIMIT-LABEL: @test6(
+; LARGER-LIMIT-NEXT:  entry:
+; LARGER-LIMIT-NEXT:    [[BPTR:%.*]] = bitcast i32* [[PTR:%.*]] to i16*
+; LARGER-LIMIT-NEXT:    [[BPTR1:%.*]] = getelementptr inbounds i16, i16* [[BPTR]], i64 0
+; LARGER-LIMIT-NEXT:    [[BPTR2:%.*]] = getelementptr inbounds i16, i16* [[BPTR]], i64 1
+; LARGER-LIMIT-NEXT:    store i16 1456, i16* [[BPTR2]], align 1
+; LARGER-LIMIT-NEXT:    store i16 -1, i16* [[BPTR1]], align 1
+; LARGER-LIMIT-NEXT:    ret i8 0
+;
 entry:
   store i32 0, i32* %ptr
@@ -321,6 +446,19 @@
 ; CHECK-NEXT:    store i16 5656, i16* [[BPTR4]], align 1
 ; CHECK-NEXT:    ret i8 0
 ;
+; LARGER-LIMIT-LABEL: @test7(
+; LARGER-LIMIT-NEXT:  entry:
+; LARGER-LIMIT-NEXT:    [[BPTR:%.*]] = bitcast i64* [[PTR:%.*]] to i16*
+; LARGER-LIMIT-NEXT:    [[BPTR1:%.*]] = getelementptr inbounds i16, i16* [[BPTR]], i64 0
+; LARGER-LIMIT-NEXT:    [[BPTR2:%.*]] = getelementptr inbounds i16, i16* [[BPTR]], i64 1
+; LARGER-LIMIT-NEXT:    [[BPTR3:%.*]] = getelementptr inbounds i16, i16* [[BPTR]], i64 2
+; LARGER-LIMIT-NEXT:    [[BPTR4:%.*]] = getelementptr inbounds i16, i16* [[BPTR]], i64 3
+; LARGER-LIMIT-NEXT:    store i16 1346, i16* [[BPTR1]], align 1
+; LARGER-LIMIT-NEXT:    store i16 1756, i16* [[BPTR3]], align 1
+; LARGER-LIMIT-NEXT:    store i16 1456, i16* [[BPTR2]], align 1
+; LARGER-LIMIT-NEXT:    store i16 5656, i16* [[BPTR4]], align 1
+; LARGER-LIMIT-NEXT:    ret i8 0
+;
 entry:
   store i64 0, i64* %ptr
diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/debug-counter.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/debug-counter.ll
--- a/llvm/test/Transforms/DeadStoreElimination/MSSA/debug-counter.ll
+++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/debug-counter.ll
@@ -1,7 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py

-; XFAIL: *
-
 ; REQUIRES: asserts

 ; Eliminates store to %R in the entry block.
diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/memoryssa-scan-limit.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/memoryssa-scan-limit.ll
--- a/llvm/test/Transforms/DeadStoreElimination/MSSA/memoryssa-scan-limit.ll
+++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/memoryssa-scan-limit.ll
@@ -1,7 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py

-; XFAIL: *
-
 ; RUN: opt < %s -basic-aa -dse -enable-dse-memoryssa -S | FileCheck --check-prefix=NO-LIMIT %s
 ; RUN: opt < %s -basic-aa -dse -enable-dse-memoryssa -dse-memoryssa-scanlimit=0 -S | FileCheck --check-prefix=LIMIT-0 %s
 ; RUN: opt < %s -basic-aa -dse -enable-dse-memoryssa -dse-memoryssa-scanlimit=2 -S | FileCheck --check-prefix=LIMIT-2 %s
diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-overlap.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-overlap.ll
--- a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-overlap.ll
+++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-overlap.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -dse -enable-dse-memoryssa %s -S | FileCheck %s
+; RUN: opt -dse -enable-dse-memoryssa -dse-memoryssa-partial-store-limit=10 %s -S | FileCheck --check-prefix=LARGER-LIMIT %s

 %struct.ham = type { [3 x double], [3 x double]}

@@ -7,6 +8,8 @@
 declare void @may_throw()
 declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1 immarg)

+; We miss this case, because of an aggressive limit on partial overlap
+; analysis.
 define void @overlap1(%struct.ham* %arg, i1 %cond) {
 ; CHECK-LABEL: @overlap1(
 ; CHECK-NEXT:  bb:
@@ -16,6 +19,9 @@
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 1, i64 2
 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 1, i64 1
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 1, i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast double* [[TMP2]] to i8*
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[TMP6]], i64 32
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* nonnull align 8 dereferenceable(48) [[TMP0]], i8 0, i64 16, i1 false)
 ; CHECK-NEXT:    br i1 [[COND:%.*]], label [[BB7:%.*]], label [[BB8:%.*]]
 ; CHECK:       bb7:
 ; CHECK-NEXT:    br label [[BB9:%.*]]
@@ -30,6 +36,28 @@
 ; CHECK-NEXT:    store double 6.000000e+00, double* [[TMP3]], align 8
 ; CHECK-NEXT:    ret void
 ;
+; LARGER-LIMIT-LABEL: @overlap1(
+; LARGER-LIMIT-NEXT:  bb:
+; LARGER-LIMIT-NEXT:    [[TMP:%.*]] = getelementptr inbounds [[STRUCT_HAM:%.*]], %struct.ham* [[ARG:%.*]], i64 0, i32 0, i64 2
+; LARGER-LIMIT-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 0, i64 1
+; LARGER-LIMIT-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 0, i64 0
+; LARGER-LIMIT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 1, i64 2
+; LARGER-LIMIT-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 1, i64 1
+; LARGER-LIMIT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 1, i32 0
+; LARGER-LIMIT-NEXT:    br i1 [[COND:%.*]], label [[BB7:%.*]], label [[BB8:%.*]]
+; LARGER-LIMIT:       bb7:
+; LARGER-LIMIT-NEXT:    br label [[BB9:%.*]]
+; LARGER-LIMIT:       bb8:
+; LARGER-LIMIT-NEXT:    br label [[BB9]]
+; LARGER-LIMIT:       bb9:
+; LARGER-LIMIT-NEXT:    store double 1.000000e+00, double* [[TMP2]], align 8
+; LARGER-LIMIT-NEXT:    store double 2.000000e+00, double* [[TMP1]], align 8
+; LARGER-LIMIT-NEXT:    store double 3.000000e+00, double* [[TMP]], align 8
+; LARGER-LIMIT-NEXT:    store double 4.000000e+00, double* [[TMP5]], align 8
+; LARGER-LIMIT-NEXT:    store double 5.000000e+00, double* [[TMP4]], align 8
+; LARGER-LIMIT-NEXT:    store double 6.000000e+00, double* [[TMP3]], align 8
+; LARGER-LIMIT-NEXT:    ret void
+;
 bb:
   %tmp = getelementptr inbounds %struct.ham, %struct.ham* %arg, i64 0, i32 0, i64 2
   %tmp1 = getelementptr inbounds %struct.ham, %struct.ham* %arg, i64 0, i32 0, i64 1
@@ -83,6 +111,31 @@
 ; CHECK-NEXT:    store double 6.000000e+00, double* [[TMP3]], align 8
 ; CHECK-NEXT:    ret void
 ;
+; LARGER-LIMIT-LABEL: @overlap2(
+; LARGER-LIMIT-NEXT:  bb:
+; LARGER-LIMIT-NEXT:    [[TMP:%.*]] = getelementptr inbounds [[STRUCT_HAM:%.*]], %struct.ham* [[ARG:%.*]], i64 0, i32 0, i64 2
+; LARGER-LIMIT-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 0, i64 1
+; LARGER-LIMIT-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 0, i64 0
+; LARGER-LIMIT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 1, i64 2
+; LARGER-LIMIT-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 1, i64 1
+; LARGER-LIMIT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 1, i32 0
+; LARGER-LIMIT-NEXT:    [[TMP6:%.*]] = bitcast double* [[TMP2]] to i8*
+; LARGER-LIMIT-NEXT:    call void @llvm.memset.p0i8.i64(i8* nonnull align 8 dereferenceable(48) [[TMP6]], i8 0, i64 48, i1 false)
+; LARGER-LIMIT-NEXT:    br i1 [[COND:%.*]], label [[BB7:%.*]], label [[BB8:%.*]]
+; LARGER-LIMIT:       bb7:
+; LARGER-LIMIT-NEXT:    call void @may_throw()
+; LARGER-LIMIT-NEXT:    br label [[BB9:%.*]]
+; LARGER-LIMIT:       bb8:
+; LARGER-LIMIT-NEXT:    br label [[BB9]]
+; LARGER-LIMIT:       bb9:
+; LARGER-LIMIT-NEXT:    store double 1.000000e+00, double* [[TMP2]], align 8
+; LARGER-LIMIT-NEXT:    store double 2.000000e+00, double* [[TMP1]], align 8
+; LARGER-LIMIT-NEXT:    store double 3.000000e+00, double* [[TMP]], align 8
+; LARGER-LIMIT-NEXT:    store double 4.000000e+00, double* [[TMP5]], align 8
+; LARGER-LIMIT-NEXT:    store double 5.000000e+00, double* [[TMP4]], align 8
+; LARGER-LIMIT-NEXT:    store double 6.000000e+00, double* [[TMP3]], align 8
+; LARGER-LIMIT-NEXT:    ret void
+;
 bb:
   %tmp = getelementptr inbounds %struct.ham, %struct.ham* %arg, i64 0, i32 0, i64 2
   %tmp1 = getelementptr inbounds %struct.ham, %struct.ham* %arg, i64 0, i32 0, i64 1
@@ -127,6 +180,18 @@
 ; CHECK-NEXT:    store i16 0, i16* [[CAST_I16]], align 4
 ; CHECK-NEXT:    ret void
 ;
+; LARGER-LIMIT-LABEL: @overlap_no_dominance(
+; LARGER-LIMIT-NEXT:  bb:
+; LARGER-LIMIT-NEXT:    br i1 [[C:%.*]], label [[BB13:%.*]], label [[BB9:%.*]]
+; LARGER-LIMIT:       bb9:
+; LARGER-LIMIT-NEXT:    [[CAST_I32:%.*]] = bitcast [4 x i8]* [[ARG:%.*]] to i32*
+; LARGER-LIMIT-NEXT:    store i32 844283136, i32* [[CAST_I32]], align 4
+; LARGER-LIMIT-NEXT:    br label [[BB13]]
+; LARGER-LIMIT:       bb13:
+; LARGER-LIMIT-NEXT:    [[CAST_I16:%.*]] = bitcast [4 x i8]* [[ARG]] to i16*
+; LARGER-LIMIT-NEXT:    store i16 0, i16* [[CAST_I16]], align 4
+; LARGER-LIMIT-NEXT:    ret void
+;
 bb:
   br i1 %c, label %bb13, label %bb9
diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/simple.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/simple.ll
--- a/llvm/test/Transforms/DeadStoreElimination/MSSA/simple.ll
+++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/simple.ll
@@ -477,10 +477,8 @@
   ret i32 0
 }

-; TODO
-; We can remove redundant store, as noalias %p guarantees that the function does
-; only access it via %p. This also holds for the call to unknown_func even though
-; it could unwind
+; We cannot remove any stores, because @unknown_func may unwind and the caller
+; may read %p while unwinding.
 define void @test34(i32* noalias %p) {
 ; CHECK-LABEL: @test34(
 ; CHECK-NEXT:    store i32 1, i32* [[P:%.*]], align 4
@@ -636,9 +634,10 @@
   ret void
 }

-; I think this case is currently handled incorrectly by memdeps dse
-; throwing should leave store i32 1, not remove from the free.
 declare void @free(i8* nocapture)
+
+; We cannot remove `store i32 1, i32* %P`, because @unknown_func may unwind
+; and the caller may read %P while unwinding.
 define void @test41(i32* noalias %P) {
 ; CHECK-LABEL: @test41(
 ; CHECK-NEXT:    [[P2:%.*]] = bitcast i32* [[P:%.*]] to i8*
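
For illustration only (not part of the patch): a reduced standalone example, in the spirit of @test7 in combined-partial-overwrites.ll, that exercises the new -dse-memoryssa-partial-store-limit option. The function name and the exact store count below are invented. Each i8 store only partially overwrites the leading i64 store, and every partially overlapping candidate returned during the upwards walk decrements the killing store's PartialLimit budget; with more such candidates than the default limit of 5, DSE may leave the wide store in place, while a raised limit gives the walk enough budget to prove it dead, which is the behavior the LARGER-LIMIT prefixes above check.

; Hypothetical sketch; compare the output of:
;   opt -S -dse -enable-dse-memoryssa %s
;   opt -S -dse -enable-dse-memoryssa -dse-memoryssa-partial-store-limit=10 %s
define void @wide_store_many_partial_overwrites(i64* %ptr) {
entry:
  ; Candidate for elimination: fully covered by the eight i8 stores below.
  store i64 0, i64* %ptr
  %bptr = bitcast i64* %ptr to i8*
  %b1 = getelementptr inbounds i8, i8* %bptr, i64 1
  %b2 = getelementptr inbounds i8, i8* %bptr, i64 2
  %b3 = getelementptr inbounds i8, i8* %bptr, i64 3
  %b4 = getelementptr inbounds i8, i8* %bptr, i64 4
  %b5 = getelementptr inbounds i8, i8* %bptr, i64 5
  %b6 = getelementptr inbounds i8, i8* %bptr, i64 6
  %b7 = getelementptr inbounds i8, i8* %bptr, i64 7
  ; Each store overwrites only one byte of the i64 store above.
  store i8 0, i8* %bptr
  store i8 1, i8* %b1
  store i8 2, i8* %b2
  store i8 3, i8* %b3
  store i8 4, i8* %b4
  store i8 5, i8* %b5
  store i8 6, i8* %b6
  store i8 7, i8* %b7
  ret void
}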