diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
--- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -1626,17 +1626,23 @@
 // Check for any extra throws between SI and NI that block DSE.  This only
 // checks extra maythrows (those that arn't MemoryDef's). MemoryDef that may
 // throw are handled during the walk from one def to the next.
-bool mayThrowBetween(Instruction *SI, Instruction *NI, const Value *SILocUnd,
-                     SmallSetVector<const Value *, 16> &InvisibleToCaller,
-                     SmallPtrSetImpl<BasicBlock *> &ThrowingBlocks) {
+bool mayThrowBetween(
+    Instruction *SI, Instruction *NI, const Value *SILocUnd,
+    SmallSetVector<const Value *, 16> &InvisibleToCaller,
+    DenseMap<BasicBlock *, DenseMap<Instruction *, unsigned>> &ThrowingBlocks) {
   // First see if we can ignore it by using the fact that SI is an alloca/alloca
   // like object that is not visible to the caller during execution of the
   // function.
   if (SILocUnd && InvisibleToCaller.count(SILocUnd))
     return false;
 
-  if (SI->getParent() == NI->getParent())
-    return ThrowingBlocks.find(SI->getParent()) != ThrowingBlocks.end();
+  if (SI->getParent() == NI->getParent()) {
+    auto BlockMap = ThrowingBlocks.find(SI->getParent());
+    if (BlockMap == ThrowingBlocks.end())
+      return false;
+    return BlockMap->second[SI] !=
+           BlockMap->second[&*std::prev(NI->getIterator())];
+  }
   return !ThrowingBlocks.empty();
 }
 
@@ -1692,8 +1698,12 @@
   // Keep track of all of the objects that are invisible to the caller until the
   // function returns.
   SmallSetVector<const Value *, 16> InvisibleToCaller;
-  // Keep track of blocks with throwing instructions.
-  SmallPtrSet<BasicBlock *, 16> ThrowingBlocks;
+  // Keep track of blocks with throwing instructions. For each block with
+  // throwing instructions, we number all instructions in the block, starting
+  // with 0 and incrementing the counter for throwing instructions. If 2
+  // instructions have the same number there are no throwing instructions
+  // between them.
+  DenseMap<BasicBlock *, DenseMap<Instruction *, unsigned>> ThrowingBlocks;
 
   // Post-order numbers for basic blocks in the function.
   DenseMap<BasicBlock *, unsigned> PostOrderNumbers;
@@ -1702,9 +1712,13 @@
   // post-order numbers for basic blocks.
   for (BasicBlock *BB : post_order(&F)) {
     PostOrderNumbers[BB] = PO++;
+
+    unsigned ThrowingCnt = 0;
+    DenseMap<Instruction *, unsigned> BlockMap;
     for (Instruction &I : *BB) {
       if (I.mayThrow())
-        ThrowingBlocks.insert(I.getParent());
+        ThrowingCnt++;
+      BlockMap[&I] = ThrowingCnt;
 
       auto *MD = dyn_cast_or_null<MemoryDef>(MSSA.getMemoryAccess(&I));
       if (MD && hasAnalyzableMemoryWrite(&I, TLI) && isRemovable(&I))
@@ -1718,6 +1732,10 @@
         (isAllocLikeFn(&I, &TLI) && !PointerMayBeCaptured(&I, false, true)))
       InvisibleToCaller.insert(&I);
     }
+
+    // Only add the mapping for blocks with any throwing instructions.
+    if (ThrowingCnt != 0)
+      ThrowingBlocks[BB] = BlockMap;
   }
   // Treat byval or inalloca arguments the same as Allocas, stores to them are
   // dead at the end of the function.
diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/combined-partial-overwrites.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/combined-partial-overwrites.ll
--- a/llvm/test/Transforms/DeadStoreElimination/MSSA/combined-partial-overwrites.ll
+++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/combined-partial-overwrites.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -dse -enable-dse-partial-store-merging=false < %s | FileCheck %s
+; RUN: opt -S -dse -enable-dse-memoryssa -enable-dse-partial-store-merging=false < %s | FileCheck %s
 target datalayout = "E-m:e-i64:64-n32:64"
 target triple = "powerpc64-bgq-linux"
 
@@ -252,6 +252,7 @@
 ; CHECK-NEXT:    [[BPTR:%.*]] = bitcast i32* [[PTR:%.*]] to i8*
 ; CHECK-NEXT:    [[BPTR1:%.*]] = getelementptr inbounds i8, i8* [[BPTR]], i64 1
 ; CHECK-NEXT:    [[BPTR2:%.*]] = getelementptr inbounds i8, i8* [[BPTR]], i64 2
+; CHECK-NEXT:    [[BPTR3:%.*]] = getelementptr inbounds i8, i8* [[BPTR]], i64 3
 ; CHECK-NEXT:    [[WPTR:%.*]] = bitcast i8* [[BPTR]] to i16*
 ; CHECK-NEXT:    [[WPTR1:%.*]] = bitcast i8* [[BPTR1]] to i16*
 ; CHECK-NEXT:    [[WPTR2:%.*]] = bitcast i8* [[BPTR2]] to i16*
diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/simple.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/simple.ll
--- a/llvm/test/Transforms/DeadStoreElimination/MSSA/simple.ll
+++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/simple.ll
@@ -218,7 +218,7 @@
 ; it could unwind
 define void @test34(i32* noalias %p) {
 ; CHECK-LABEL: @test34(
-; CHECK-NEXT:    store i32 1, i32* [[P]]
+; CHECK-NEXT:    store i32 1, i32* [[P:%.*]]
 ; CHECK-NEXT:    call void @unknown_func()
 ; CHECK-NEXT:    store i32 0, i32* [[P]]
 ; CHECK-NEXT:    ret void
@@ -229,12 +229,10 @@
   ret void
 }
 
-; TODO
 ; Remove redundant store even with an unwinding function in the same block
 define void @test35(i32* noalias %p) {
 ; CHECK-LABEL: @test35(
 ; CHECK-NEXT:    call void @unknown_func()
-; CHECK-NEXT:    store i32 1, i32* [[P:%.*]]
 ; CHECK-NEXT:    store i32 0, i32* [[P:%.*]]
 ; CHECK-NEXT:    ret void
 ;
@@ -384,6 +382,14 @@
 ; NOCHECK-NEXT:    call void @unknown_func()
 ; NOCHECK-NEXT:    call void @free(i8* [[P2]])
 ; NOCHECK-NEXT:    ret void
+;
+; CHECK-LABEL: @test41(
+; CHECK-NEXT:    [[P2:%.*]] = bitcast i32* [[P:%.*]] to i8*
+; CHECK-NEXT:    store i32 1, i32* [[P]]
+; CHECK-NEXT:    call void @unknown_func()
+; CHECK-NEXT:    store i32 2, i32* [[P]]
+; CHECK-NEXT:    call void @free(i8* [[P2]])
+; CHECK-NEXT:    ret void
 ;
   %P2 = bitcast i32* %P to i8*
   store i32 1, i32* %P
@@ -400,6 +406,13 @@
 ; NOCHECK-NEXT:    store i32 2, i32* [[Q:%.*]]
 ; NOCHECK-NEXT:    store i8 3, i8* [[P2]]
 ; NOCHECK-NEXT:    ret void
+;
+; CHECK-LABEL: @test42(
+; CHECK-NEXT:    store i32 1, i32* [[P:%.*]]
+; CHECK-NEXT:    [[P2:%.*]] = bitcast i32* [[P]] to i8*
+; CHECK-NEXT:    store i32 2, i32* [[Q:%.*]]
+; CHECK-NEXT:    store i8 3, i8* [[P2]]
+; CHECK-NEXT:    ret void
 ;
   store i32 1, i32* %P
   %P2 = bitcast i32* %P to i8*
@@ -415,6 +428,13 @@
 ; NOCHECK-NEXT:    store atomic i32 2, i32* [[Q:%.*]] unordered, align 4
 ; NOCHECK-NEXT:    store atomic i8 3, i8* [[P2]] unordered, align 4
 ; NOCHECK-NEXT:    ret void
+;
+; CHECK-LABEL: @test42a(
+; CHECK-NEXT:    store atomic i32 1, i32* [[P:%.*]] unordered, align 4
+; CHECK-NEXT:    [[P2:%.*]] = bitcast i32* [[P]] to i8*
+; CHECK-NEXT:    store atomic i32 2, i32* [[Q:%.*]] unordered, align 4
+; CHECK-NEXT:    store atomic i8 3, i8* [[P2]] unordered, align 4
+; CHECK-NEXT:    ret void
 ;
   store atomic i32 1, i32* %P unordered, align 4
   %P2 = bitcast i32* %P to i8*