diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
--- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -29,7 +29,10 @@
 #include "llvm/Analysis/MemoryBuiltins.h"
 #include "llvm/Analysis/MemoryDependenceAnalysis.h"
 #include "llvm/Analysis/MemoryLocation.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Analysis/MemorySSAUpdater.h"
 #include "llvm/Analysis/OrderedBasicBlock.h"
+#include "llvm/Analysis/PostDominators.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/Argument.h"
@@ -40,6 +43,7 @@
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/Function.h"
+#include "llvm/IR/InstIterator.h"
 #include "llvm/IR/InstrTypes.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
@@ -87,6 +91,15 @@
     cl::init(true), cl::Hidden,
     cl::desc("Enable partial store merging in DSE"));
 
+static cl::opt<bool>
+    EnableMemorySSA("enable-dse-memoryssa", cl::init(false), cl::Hidden,
+                    cl::desc("Use the new MemorySSA-backed DSE."));
+
+static cl::opt<int>
+    MemorySSAScanLimit("dse-memoryssa-scanlimit", cl::init(100), cl::Hidden,
+                       cl::desc("The number of memory instructions to scan for "
+                                "dead store elimination (default = 100)"));
+
 //===----------------------------------------------------------------------===//
 // Helper functions
 //===----------------------------------------------------------------------===//
@@ -1349,22 +1362,441 @@
   return MadeChange;
 }
 
+namespace {
+//=============================================================================
+// MemorySSA form of Dead store elimination.
+//
+// This uses MemorySSA to perform dead store elimination. MemorySSA will give
+// us a graph of MemoryAccesses, each of which is either a Def (store), a Use
+// (load) or a Phi. The Uses will not be no-alias of the Def they are a use of
+// (unless we hit the MemorySSA optimisation limit). Unfortunately we cannot
+// say the same of stores. So we walk forward from one store to the next,
+// until we hit a store (or collection of stores) that causes the original to
+// be dead.
+//
+// Because we are using MemorySSA we can also look across blocks. Anywhere
+// there are multiple possible successors we treat as a break in the walk.
+// Otherwise, so long as the later store post-dominates the earlier one, the
+// earlier one is dead and can be removed.
+//
+// There are numerous other things we need to handle along the way.
+// - Any instruction that may throw usually acts as a DSE barrier (unless the
+//   memory is non-escaping).
+// - We have to be careful about stores that may also be self-reads.
+// - Atomics can be optimised (especially unordered), but are often not worth
+//   it. We treat fences like maythrows, so that they block DSE in the same
+//   manner.
+//
+// The above MemDep-based methods will eventually be removed (if they are
+// unused by those below).
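+//
+// As a rough sketch of what the walk handles (illustrative IR only, not
+// taken from the tests added below), the entry-block store here is
+// removable: the killing store in %exit post-dominates it, completely
+// overwrites the same location, and no read or throwing instruction sits
+// between the two:
+//
+//   define void @example(i32* noalias %p) {
+//     store i32 0, i32* %p               ; dead
+//     br label %exit
+//   exit:
+//     store i32 1, i32* %p               ; killer
+//     ret void
+//   }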
+
+enum class WalkType { Bad, Next };
+struct WalkResult {
+  WalkType Type;
+  MemoryDef *Next;
+
+  operator bool() { return Type != WalkType::Bad; }
+};
+
+Optional<MemoryLocation> getLocForWriteEx(Instruction *I,
+                                          const TargetLibraryInfo &TLI) {
+  if (!I->mayWriteToMemory())
+    return None;
+
+  if (auto *MTI = dyn_cast<AnyMemIntrinsic>(I))
+    return {MemoryLocation::getForDest(MTI)};
+
+  if (auto CS = CallSite(I)) {
+    if (Function *F = CS.getCalledFunction()) {
+      StringRef FnName = F->getName();
+      if (TLI.has(LibFunc_strcpy) && FnName == TLI.getName(LibFunc_strcpy))
+        return {MemoryLocation(CS.getArgument(0))};
+      if (TLI.has(LibFunc_strncpy) && FnName == TLI.getName(LibFunc_strncpy))
+        return {MemoryLocation(CS.getArgument(0))};
+      if (TLI.has(LibFunc_strcat) && FnName == TLI.getName(LibFunc_strcat))
+        return {MemoryLocation(CS.getArgument(0))};
+      if (TLI.has(LibFunc_strncat) && FnName == TLI.getName(LibFunc_strncat))
+        return {MemoryLocation(CS.getArgument(0))};
+    }
+    return None;
+  }
+
+  return MemoryLocation::getOrNone(I);
+}
+
+// Returns true if \p UseInst may read from \p DefLoc.
+bool isReadClobber(MemoryLocation DefLoc, Instruction *UseInst,
+                   AliasAnalysis &AA, const TargetLibraryInfo &TLI,
+                   DominatorTree &DT) {
+  if (!UseInst->mayReadFromMemory())
+    return false;
+
+  if (auto CS = CallSite(UseInst))
+    if (CS.onlyAccessesInaccessibleMemory())
+      return false;
+
+  ModRefInfo MR = AA.getModRefInfo(UseInst, DefLoc);
+  // If necessary, perform additional analysis.
+  if (isRefSet(MR))
+    MR = AA.callCapturesBefore(UseInst, DefLoc, &DT);
+  MR = clearMust(MR);
+  return isRefSet(MR);
+}
+
+// Returns true if \p UseInst may write to \p DefLoc.
+bool isWriteClobber(MemoryLocation DefLoc, Instruction *UseInst,
+                    AliasAnalysis &AA, const TargetLibraryInfo &TLI,
+                    DominatorTree &DT) {
+  if (!UseInst->mayWriteToMemory())
+    return false;
+
+  if (auto CS = CallSite(UseInst))
+    if (CS.onlyAccessesInaccessibleMemory())
+      return false;
+
+  ModRefInfo MR = AA.getModRefInfo(UseInst, DefLoc);
+  // If necessary, perform additional analysis.
+  if (isModSet(MR))
+    MR = AA.callCapturesBefore(UseInst, DefLoc, &DT);
+  MR = clearMust(MR);
+  return isModSet(MR);
+}
+
+// Returns true if \p M is an intrinsic that does not read or write memory.
+bool isNoopIntrinsic(MemoryUseOrDef *M) {
+  if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(M->getMemoryInst())) {
+    switch (II->getIntrinsicID()) {
+    case Intrinsic::lifetime_start:
+    case Intrinsic::lifetime_end:
+    case Intrinsic::invariant_end:
+    case Intrinsic::assume:
+    case Intrinsic::dbg_addr:
+    case Intrinsic::dbg_declare:
+    case Intrinsic::dbg_label:
+    case Intrinsic::dbg_value:
+      return true;
+    default:
+      return false;
+    }
+  }
+  return false;
+}
+
+// Find the next MemoryDef clobbering Current, if there is any, skipping things
+// that do not alias. WalkType::Bad indicates that the walk hit one of the
+// following:
+//  * An aliasing MemoryUse (read).
+//  * A MemoryPHI that comes before Current.
+//  * Multiple MemoryDefs, which indicates a split in the control flow.
+WalkResult getNextMemoryDef(MemoryAccess *Current, MemoryLocation DefLoc,
+                            Instruction *DefObj, bool DefVisibleToCaller,
+                            int ScanLimit, AliasAnalysis &AA,
+                            const TargetLibraryInfo &TLI, DominatorTree &DT) {
+  if (--ScanLimit < 0)
+    return {WalkType::Bad, nullptr};
+
+  MemoryAccess *NextAccess = nullptr;
+  // Scan the uses to see if we have a single interesting MemoryAccess.
+  for (Use &U : Current->uses()) {
+    MemoryAccess *UseAccess = cast<MemoryAccess>(U.getUser());
+    // Bail out on MemoryPhis for now.
+    if (isa<MemoryPhi>(UseAccess))
+      return {WalkType::Bad, nullptr};
+
+    Instruction *UseInst = cast<MemoryUseOrDef>(UseAccess)->getMemoryInst();
+    LLVM_DEBUG(dbgs() << "  Checking use " << *UseInst << "\n");
+    // We can remove the dead stores, irrespective of the fence and its
+    // ordering (release/acquire/seq_cst). Fences only constrain the ordering
+    // of already visible stores; they do not make a store visible to other
+    // threads. So, skipping over a fence does not change a store from being
+    // dead.
+    if (isa<FenceInst>(UseInst))
+      continue;
+
+    // We cannot eliminate stores to locations visible to the caller across
+    // throwing instructions, but we can eliminate stores across throwing
+    // calls for alloc-like objects that do not escape before the current
+    // access.
+    if (UseInst->mayThrow() && DefVisibleToCaller) {
+      if (!DefObj ||
+          (!isAllocLikeFn(DefObj, &TLI) ||
+           PointerMayBeCapturedBefore(DefObj, false, true, UseInst, &DT)))
+        return {WalkType::Bad, nullptr};
+    }
+
+    // Skip intrinsics that do not really read or modify memory.
+    if (isNoopIntrinsic(cast<MemoryUseOrDef>(UseAccess))) {
+      if (isa<MemoryDef>(UseAccess))
+        NextAccess = UseAccess;
+      continue;
+    }
+
+    // Uses which may read the original MemoryDef mean we cannot eliminate the
+    // original MD. Stop walk.
+    if (isReadClobber(DefLoc, UseInst, AA, TLI, DT))
+      return {WalkType::Bad, nullptr};
+
+    if (isa<MemoryUse>(UseAccess)) {
+      // Skip non-aliasing MemoryUses.
+      continue;
+    }
+
+    // Multiple definitions, bail out.
+    if (NextAccess)
+      return {WalkType::Bad, nullptr};
+    NextAccess = UseAccess;
+  }
+
+  // If we found a next MemoryAccess, we are done if it clobbers the original
+  // def; otherwise it is an unrelated MemoryAccess and we continue the walk
+  // starting at NextAccess.
+  if (NextAccess) {
+    assert(isa<MemoryDef>(NextAccess) && "Only MemoryDefs supported for now");
+
+    if (!isNoopIntrinsic(cast<MemoryUseOrDef>(NextAccess))) {
+      if (isWriteClobber(DefLoc,
+                         cast<MemoryDef>(NextAccess)->getMemoryInst(), AA,
+                         TLI, DT))
+        return {WalkType::Next, cast<MemoryDef>(NextAccess)};
+    }
+    return getNextMemoryDef(NextAccess, DefLoc, DefObj, DefVisibleToCaller,
+                            ScanLimit, AA, TLI, DT);
+  }
+
+  // No aliasing definition found, bail out.
+  return {WalkType::Bad, nullptr};
+}
+
+// Delete dead memory defs.
+void deleteDeadInstruction(Instruction *SI, InstOverlapIntervalsTy &IOL,
+                           MemorySSA &MSSA, const TargetLibraryInfo &TLI,
+                           SmallPtrSetImpl<MemoryDef *> &SkipStores) {
+  MemorySSAUpdater Updater(&MSSA);
+  SmallVector<Instruction *, 32> NowDeadInsts;
+  NowDeadInsts.push_back(SI);
+  --NumFastOther;
+
+  while (!NowDeadInsts.empty()) {
+    Instruction *DeadInst = NowDeadInsts.pop_back_val();
+    ++NumFastOther;
+
+    // Remove the Instruction from MSSA and IOL.
+    if (MemoryAccess *MA = MSSA.getMemoryAccess(DeadInst)) {
+      Updater.removeMemoryAccess(MA);
+      if (MemoryDef *MD = dyn_cast<MemoryDef>(MA)) {
+        SkipStores.insert(MD);
+      }
+    }
+    IOL.erase(DeadInst);
+
+    // Remove its operands.
+    for (Use &O : DeadInst->operands())
+      if (Instruction *OpI = dyn_cast<Instruction>(O)) {
+        O = nullptr;
+        if (isInstructionTriviallyDead(OpI, &TLI))
+          NowDeadInsts.push_back(OpI);
+      }
+
+    DeadInst->eraseFromParent();
+  }
+}
+
+// Check for any extra throws between SI and NI that block DSE. This only
+// checks extra maythrows (those that aren't MemoryDefs). MemoryDefs that may
+// throw are handled during the walk from one def to the next.
+bool mayThrowBetween(Instruction *SI, Instruction *NI, const Value *SILocUnd,
+                     SmallSetVector<const Value *, 16> &InvisibleToCaller,
+                     SmallPtrSetImpl<BasicBlock *> &ThrowingBlocks) {
+  // First see if we can ignore it by using the fact that SI is an
+  // alloca/alloca-like object that is not visible to the caller during
+  // execution of the function.
+  if (SILocUnd && InvisibleToCaller.count(SILocUnd))
+    return false;
+
+  if (SI->getParent() == NI->getParent())
+    return ThrowingBlocks.find(SI->getParent()) != ThrowingBlocks.end();
+  return !ThrowingBlocks.empty();
+}
+
+// Check if \p NI acts as a DSE barrier for \p SI. The following instructions
+// act as barriers:
+//  * A memory instruction that may throw, where \p SI accesses a non-stack
+//    object.
+//  * Atomic instructions stronger than monotonic.
+bool isDSEBarrier(Instruction *SI, MemoryLocation &SILoc,
+                  const Value *SILocUnd, Instruction *NI,
+                  MemoryLocation &NILoc,
+                  SmallSetVector<const Value *, 16> &InvisibleToCaller,
+                  AliasAnalysis &AA, const TargetLibraryInfo &TLI) {
+  // If NI may throw it acts as a barrier, unless SI accesses an
+  // alloca/alloca-like object that does not escape.
+  if (NI->mayThrow() && !InvisibleToCaller.count(SILocUnd))
+    return true;
+
+  if (NI->isAtomic()) {
+    if (auto *NSI = dyn_cast<StoreInst>(NI)) {
+      if (isStrongerThanMonotonic(NSI->getOrdering()))
+        return true;
+    } else if (auto *NLI = dyn_cast<LoadInst>(NI)) {
+      if (isStrongerThanMonotonic(NLI->getOrdering()) ||
+          !AA.isNoAlias(MemoryLocation::get(NLI), SILoc))
+        return true;
+    } else if (isa<FenceInst>(NI))
+      // We can remove the dead stores, irrespective of the fence and its
+      // ordering (release/acquire/seq_cst). Fences only constrain the
+      // ordering of already visible stores; they do not make a store visible
+      // to other threads. So, skipping over a fence does not change a store
+      // from being dead. We treat fences as maythrows, to be conservative, so
+      // that they block optimizations in the same way.
+      return false;
+    else
+      return true;
+  }
+
+  return false;
+}
+
+bool eliminateDeadStoresMemorySSA(Function &F, AliasAnalysis &AA,
+                                  MemorySSA &MSSA, DominatorTree &DT,
+                                  PostDominatorTree &PDT,
+                                  const TargetLibraryInfo &TLI) {
+  const DataLayout &DL = F.getParent()->getDataLayout();
+  bool MadeChange = false;
+
+  // All MemoryDef stores.
+  SmallVector<MemoryDef *, 64> Stores;
+  // Any that should be skipped as they are already deleted.
+  SmallPtrSet<MemoryDef *, 8> SkipStores;
+  // A map of interval maps representing partially-overwritten value parts.
+  InstOverlapIntervalsTy IOL;
+  // Keep track of all of the objects that are invisible to the caller until
+  // the function returns.
+  SmallSetVector<const Value *, 16> InvisibleToCaller;
+  // Keep track of blocks with throwing instructions not modeled in MemorySSA.
+  SmallPtrSet<BasicBlock *, 16> ThrowingBlocks;
+
+  // Collect blocks with throwing instructions not modeled in MemorySSA and
+  // alloc-like objects.
+  for (Instruction &I : instructions(F)) {
+    if (I.mayThrow() && !MSSA.getMemoryAccess(&I))
+      ThrowingBlocks.insert(I.getParent());
+
+    auto *MD = dyn_cast_or_null<MemoryDef>(MSSA.getMemoryAccess(&I));
+    if (MD && hasAnalyzableMemoryWrite(&I, TLI) && isRemovable(&I))
+      Stores.push_back(MD);
+
+    // Track alloca and alloca-like objects. Here we care about objects not
+    // visible to the caller during function execution. Alloca objects are
+    // invalid in the caller; for alloca-like objects we ensure that they are
+    // not captured throughout the function.
+    if (isa<AllocaInst>(&I) ||
+        (isAllocLikeFn(&I, &TLI) && !PointerMayBeCaptured(&I, false, true)))
+      InvisibleToCaller.insert(&I);
+  }
+  // Treat byval or inalloca arguments the same as Allocas: stores to them are
+  // dead at the end of the function.
+  for (Argument &AI : F.args())
+    if (AI.hasByValOrInAllocaAttr())
+      InvisibleToCaller.insert(&AI);
+
+  // For each store:
+  while (!Stores.empty()) {
+    MemoryDef *SIMD = Stores.back();
+    Stores.pop_back();
+    if (SkipStores.count(SIMD))
+      continue;
+    Instruction *SI = SIMD->getMemoryInst();
+    MemoryLocation SILoc = *getLocForWriteEx(SI, TLI);
+    assert(SILoc.Ptr && "SILoc should not be null");
+    const Value *SILocUnd = GetUnderlyingObject(SILoc.Ptr, DL);
+    Instruction *DefObj =
+        const_cast<Instruction *>(dyn_cast<Instruction>(SILocUnd));
+    bool LocVisibleToCaller = !InvisibleToCaller.count(SILocUnd);
+
+    LLVM_DEBUG(dbgs() << "Trying to eliminate " << *SI << "\n");
+
+    // Walk MemorySSA forward to find a MemoryDef that kills SI.
+    WalkResult Next;
+    while ((Next = getNextMemoryDef(SIMD, SILoc, DefObj, LocVisibleToCaller,
+                                    MemorySSAScanLimit, AA, TLI, DT))) {
+      MemoryAccess *NextAccess = Next.Next;
+      LLVM_DEBUG(dbgs() << "  Checking " << *NextAccess << "\n");
+      MemoryDef *ND = dyn_cast<MemoryDef>(NextAccess);
+      Instruction *NI = ND->getMemoryInst();
+      LLVM_DEBUG(dbgs() << "  def " << *NI << "\n");
+
+      if (!hasAnalyzableMemoryWrite(NI, TLI))
+        break;
+      MemoryLocation NILoc = *getLocForWriteEx(NI, TLI);
+      // Check for anything that looks like it will be a barrier to further
+      // removal.
+      if (isDSEBarrier(SI, SILoc, SILocUnd, NI, NILoc, InvisibleToCaller, AA,
+                       TLI)) {
+        LLVM_DEBUG(dbgs() << "  stop, barrier\n");
+        break;
+      }
+
+      // The killing store must post-dominate the dead one for us to safely
+      // remove it.
+      if (!PDT.dominates(ND->getBlock(), SIMD->getBlock())) {
+        SIMD = cast<MemoryDef>(NextAccess);
+        LLVM_DEBUG(dbgs() << "  continue, not post-dominating!\n");
+        continue;
+      }
+
+      // Before we try to remove anything, check for any extra throwing
+      // instructions that block us from DSEing.
+      if (mayThrowBetween(SI, NI, SILocUnd, InvisibleToCaller,
+                          ThrowingBlocks)) {
+        LLVM_DEBUG(dbgs() << "  stop, may throw!\n");
+        break;
+      }
+
+      // Get what type of overwrite this might be.
+      int64_t InstWriteOffset, DepWriteOffset;
+      OverwriteResult OR = isOverwrite(NILoc, SILoc, DL, TLI, DepWriteOffset,
+                                       InstWriteOffset, SI, IOL, AA, &F);
+
+      if (OR == OW_Complete) {
+        LLVM_DEBUG(dbgs() << "DSE: Remove Dead Store:\n  DEAD: " << *SI
+                          << "\n  KILLER: " << *NI << '\n');
+        deleteDeadInstruction(SI, IOL, MSSA, TLI, SkipStores);
+        ++NumFastStores;
+        MadeChange = true;
+        break;
+      }
+      SIMD = cast<MemoryDef>(NextAccess);
+    }
+  }
+
+  return MadeChange;
+}
+} // end anonymous namespace
+
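+// Illustrative usage note (mirroring the RUN lines of the MSSA tests below):
+// the MemorySSA-backed implementation above is off by default and is enabled
+// with -enable-dse-memoryssa, with the walk bounded by
+// -dse-memoryssa-scanlimit, e.g.
+//
+//   opt -basicaa -dse -enable-dse-memoryssa -dse-memoryssa-scanlimit=100 -S
+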
 //===----------------------------------------------------------------------===//
 // DSE Pass
 //===----------------------------------------------------------------------===//
 PreservedAnalyses DSEPass::run(Function &F, FunctionAnalysisManager &AM) {
-  AliasAnalysis *AA = &AM.getResult<AAManager>(F);
-  DominatorTree *DT = &AM.getResult<DominatorTreeAnalysis>(F);
-  MemoryDependenceResults *MD = &AM.getResult<MemoryDependenceAnalysis>(F);
-  const TargetLibraryInfo *TLI = &AM.getResult<TargetLibraryAnalysis>(F);
+  AliasAnalysis &AA = AM.getResult<AAManager>(F);
+  const TargetLibraryInfo &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+  DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
 
-  if (!eliminateDeadStores(F, AA, MD, DT, TLI))
-    return PreservedAnalyses::all();
+  if (EnableMemorySSA) {
+    MemorySSA &MSSA = AM.getResult<MemorySSAAnalysis>(F).getMSSA();
+    PostDominatorTree &PDT = AM.getResult<PostDominatorTreeAnalysis>(F);
+
+    if (!eliminateDeadStoresMemorySSA(F, AA, MSSA, DT, PDT, TLI))
+      return PreservedAnalyses::all();
+  } else {
+    MemoryDependenceResults &MD = AM.getResult<MemoryDependenceAnalysis>(F);
+
+    if (!eliminateDeadStores(F, &AA, &MD, &DT, &TLI))
+      return PreservedAnalyses::all();
+  }
 
   PreservedAnalyses PA;
   PA.preserveSet<CFGAnalyses>();
   PA.preserve<GlobalsAA>();
-  PA.preserve<MemoryDependenceAnalysis>();
+  if (EnableMemorySSA)
+    PA.preserve<MemorySSAAnalysis>();
+  else
+    PA.preserve<MemoryDependenceAnalysis>();
   return PA;
 }
 
@@ -1383,25 +1815,42 @@
     if (skipFunction(F))
       return false;
 
-    DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
-    AliasAnalysis *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
-    MemoryDependenceResults *MD =
-        &getAnalysis<MemoryDependenceWrapperPass>().getMemDep();
-    const TargetLibraryInfo *TLI =
-        &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
+    AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
+    DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+    const TargetLibraryInfo &TLI =
+        getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
+
+    if (EnableMemorySSA) {
+      MemorySSA &MSSA = getAnalysis<MemorySSAWrapperPass>().getMSSA();
+      PostDominatorTree &PDT =
+          getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
+
+      return eliminateDeadStoresMemorySSA(F, AA, MSSA, DT, PDT, TLI);
+    } else {
+      MemoryDependenceResults &MD =
+          getAnalysis<MemoryDependenceWrapperPass>().getMemDep();
 
-    return eliminateDeadStores(F, AA, MD, DT, TLI);
+      return eliminateDeadStores(F, &AA, &MD, &DT, &TLI);
+    }
   }
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesCFG();
-    AU.addRequired<DominatorTreeWrapperPass>();
     AU.addRequired<AAResultsWrapperPass>();
-    AU.addRequired<MemoryDependenceWrapperPass>();
     AU.addRequired<TargetLibraryInfoWrapperPass>();
-    AU.addPreserved<DominatorTreeWrapperPass>();
     AU.addPreserved<GlobalsAAWrapperPass>();
-    AU.addPreserved<MemoryDependenceWrapperPass>();
+    AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addPreserved<DominatorTreeWrapperPass>();
+
+    if (EnableMemorySSA) {
+      AU.addRequired<PostDominatorTreeWrapperPass>();
+      AU.addRequired<MemorySSAWrapperPass>();
+      AU.addPreserved<PostDominatorTreeWrapperPass>();
+      AU.addPreserved<MemorySSAWrapperPass>();
+    } else {
+      AU.addRequired<MemoryDependenceWrapperPass>();
+      AU.addPreserved<MemoryDependenceWrapperPass>();
+    }
   }
 };
 
@@ -1412,8 +1861,10 @@
 INITIALIZE_PASS_BEGIN(DSELegacyPass, "dse", "Dead Store Elimination", false,
                       false)
 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
 INITIALIZE_PASS_END(DSELegacyPass, "dse", "Dead Store Elimination", false,
diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/fence.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/fence-todo.ll
copy from llvm/test/Transforms/DeadStoreElimination/MSSA/fence.ll
copy to llvm/test/Transforms/DeadStoreElimination/MSSA/fence-todo.ll
--- a/llvm/test/Transforms/DeadStoreElimination/MSSA/fence.ll
+++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/fence-todo.ll
@@ -1,51 +1,6 @@
-; RUN: opt -S -basicaa -dse -enable-dse-memoryssa < %s | FileCheck %s
-
-; We conservative choose to prevent dead store elimination
-; across release or stronger fences. It's not required
-; (since the must still be a race on %addd.i), but
-; it is conservatively correct. A legal optimization
-; could hoist the second store above the fence, and then
-; DSE one of them.
-define void @test1(i32* %addr.i) {
-; CHECK-LABEL: @test1
-; CHECK: store i32 5
-; CHECK: fence
-; CHECK: store i32 5
-; CHECK: ret
-  store i32 5, i32* %addr.i, align 4
-  fence release
-  store i32 5, i32* %addr.i, align 4
-  ret void
-}
-
-; Same as previous, but with different values.
If we ever optimize -; this more aggressively, this allows us to check that the correct -; store is retained (the 'i32 1' store in this case) -define void @test1b(i32* %addr.i) { -; CHECK-LABEL: @test1b -; CHECK: store i32 42 -; CHECK: fence release -; CHECK: store i32 1 -; CHECK: ret - store i32 42, i32* %addr.i, align 4 - fence release - store i32 1, i32* %addr.i, align 4 - ret void -} +; XFAIL: * -; We *could* DSE across this fence, but don't. No other thread can -; observe the order of the acquire fence and the store. -define void @test2(i32* %addr.i) { -; CHECK-LABEL: @test2 -; CHECK: store -; CHECK: fence -; CHECK: store -; CHECK: ret - store i32 5, i32* %addr.i, align 4 - fence acquire - store i32 5, i32* %addr.i, align 4 - ret void -} +; RUN: opt -S -basicaa -dse -enable-dse-memoryssa < %s | FileCheck %s ; We DSE stack alloc'ed and byval locations, in the presence of fences. ; Fence does not make an otherwise thread local store visible. @@ -93,4 +48,3 @@ store i32 4, i32* %P1, align 4 ret void } - diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/fence.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/fence.ll --- a/llvm/test/Transforms/DeadStoreElimination/MSSA/fence.ll +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/fence.ll @@ -46,51 +46,3 @@ store i32 5, i32* %addr.i, align 4 ret void } - -; We DSE stack alloc'ed and byval locations, in the presence of fences. -; Fence does not make an otherwise thread local store visible. -; Right now the DSE in presence of fence is only done in end blocks (with no successors), -; but the same logic applies to other basic blocks as well. -; The store to %addr.i can be removed since it is a byval attribute -define void @test3(i32* byval %addr.i) { -; CHECK-LABEL: @test3 -; CHECK-NOT: store -; CHECK: fence -; CHECK: ret - store i32 5, i32* %addr.i, align 4 - fence release - ret void -} - -declare void @foo(i8* nocapture %p) - -declare noalias i8* @malloc(i32) - -; DSE of stores in locations allocated through library calls. -define void @test_nocapture() { -; CHECK-LABEL: @test_nocapture -; CHECK: malloc -; CHECK: foo -; CHECK-NOT: store -; CHECK: fence - %m = call i8* @malloc(i32 24) - call void @foo(i8* %m) - store i8 4, i8* %m - fence release - ret void -} - - -; This is a full fence, but it does not make a thread local store visible. -; We can DSE the store in presence of the fence. 
-define void @fence_seq_cst() { -; CHECK-LABEL: @fence_seq_cst -; CHECK-NEXT: fence seq_cst -; CHECK-NEXT: ret void - %P1 = alloca i32 - store i32 0, i32* %P1, align 4 - fence seq_cst - store i32 4, i32* %P1, align 4 - ret void -} - diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/memcpy-complete-overwrite.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/memcpy-complete-overwrite.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/memcpy-complete-overwrite.ll @@ -0,0 +1,189 @@ +; XFAIL: * + +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -basicaa -dse -enable-dse-memoryssa -S | FileCheck %s +; RUN: opt < %s -aa-pipeline=basic-aa -passes=dse -enable-dse-memoryssa -S | FileCheck %s +target datalayout = "E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-v64:64:64-v128:128:128" + +declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i1) nounwind +declare void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* nocapture, i8, i64, i32) nounwind +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i1) nounwind +declare void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32) nounwind + +; PR8701 + +;; Fully dead overwrite of memcpy. +define void @test15(i8* %P, i8* %Q) nounwind ssp { +; CHECK-LABEL: @test15( +; CHECK-NEXT: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[P:%.*]], i8* [[Q:%.*]], i64 12, i1 false) +; CHECK-NEXT: ret void +; + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i1 false) + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i1 false) + ret void +} + +;; Fully dead overwrite of memcpy. +define void @test15_atomic(i8* %P, i8* %Q) nounwind ssp { +; CHECK-LABEL: @test15_atomic( +; CHECK-NEXT: tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 [[P:%.*]], i8* align 1 [[Q:%.*]], i64 12, i32 1) +; CHECK-NEXT: ret void +; + tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %P, i8* align 1 %Q, i64 12, i32 1) + tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %P, i8* align 1 %Q, i64 12, i32 1) + ret void +} + +;; Fully dead overwrite of memcpy. +define void @test15_atomic_weaker(i8* %P, i8* %Q) nounwind ssp { +; CHECK-LABEL: @test15_atomic_weaker( +; CHECK-NEXT: tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 [[P:%.*]], i8* align 1 [[Q:%.*]], i64 12, i32 1) +; CHECK-NEXT: ret void +; + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 %P, i8* align 1 %Q, i64 12, i1 false) + tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %P, i8* align 1 %Q, i64 12, i32 1) + ret void +} + +;; Fully dead overwrite of memcpy. +define void @test15_atomic_weaker_2(i8* %P, i8* %Q) nounwind ssp { +; CHECK-LABEL: @test15_atomic_weaker_2( +; CHECK-NEXT: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 [[P:%.*]], i8* align 1 [[Q:%.*]], i64 12, i1 false) +; CHECK-NEXT: ret void +; + tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %P, i8* align 1 %Q, i64 12, i32 1) + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 %P, i8* align 1 %Q, i64 12, i1 false) + ret void +} + +;; Full overwrite of smaller memcpy. 
+define void @test16(i8* %P, i8* %Q) nounwind ssp { +; CHECK-LABEL: @test16( +; CHECK-NEXT: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[P:%.*]], i8* [[Q:%.*]], i64 12, i1 false) +; CHECK-NEXT: ret void +; + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 8, i1 false) + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i1 false) + ret void +} + +;; Full overwrite of smaller memcpy. +define void @test16_atomic(i8* %P, i8* %Q) nounwind ssp { +; CHECK-LABEL: @test16_atomic( +; CHECK-NEXT: tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 [[P:%.*]], i8* align 1 [[Q:%.*]], i64 12, i32 1) +; CHECK-NEXT: ret void +; + tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %P, i8* align 1 %Q, i64 8, i32 1) + tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %P, i8* align 1 %Q, i64 12, i32 1) + ret void +} + +;; Full overwrite of smaller memory where overwrite has stronger atomicity +define void @test16_atomic_weaker(i8* %P, i8* %Q) nounwind ssp { +; CHECK-LABEL: @test16_atomic_weaker( +; CHECK-NEXT: tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 [[P:%.*]], i8* align 1 [[Q:%.*]], i64 12, i32 1) +; CHECK-NEXT: ret void +; + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 %P, i8* align 1 %Q, i64 8, i1 false) + tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %P, i8* align 1 %Q, i64 12, i32 1) + ret void +} + +;; Full overwrite of smaller memory where overwrite has weaker atomicity. +define void @test16_atomic_weaker_2(i8* %P, i8* %Q) nounwind ssp { +; CHECK-LABEL: @test16_atomic_weaker_2( +; CHECK-NEXT: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 [[P:%.*]], i8* align 1 [[Q:%.*]], i64 12, i1 false) +; CHECK-NEXT: ret void +; + tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %P, i8* align 1 %Q, i64 8, i32 1) + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 %P, i8* align 1 %Q, i64 12, i1 false) + ret void +} + +;; Overwrite of memset by memcpy. +define void @test17(i8* %P, i8* noalias %Q) nounwind ssp { +; CHECK-LABEL: @test17( +; CHECK-NEXT: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[P:%.*]], i8* [[Q:%.*]], i64 12, i1 false) +; CHECK-NEXT: ret void +; + tail call void @llvm.memset.p0i8.i64(i8* %P, i8 42, i64 8, i1 false) + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i1 false) + ret void +} + +;; Overwrite of memset by memcpy. +define void @test17_atomic(i8* %P, i8* noalias %Q) nounwind ssp { +; CHECK-LABEL: @test17_atomic( +; CHECK-NEXT: tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 [[P:%.*]], i8* align 1 [[Q:%.*]], i64 12, i32 1) +; CHECK-NEXT: ret void +; + tail call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 1 %P, i8 42, i64 8, i32 1) + tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %P, i8* align 1 %Q, i64 12, i32 1) + ret void +} + +;; Overwrite of memset by memcpy. Overwrite is stronger atomicity. We can +;; remove the memset. 
+define void @test17_atomic_weaker(i8* %P, i8* noalias %Q) nounwind ssp { +; CHECK-LABEL: @test17_atomic_weaker( +; CHECK-NEXT: tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 [[P:%.*]], i8* align 1 [[Q:%.*]], i64 12, i32 1) +; CHECK-NEXT: ret void +; + tail call void @llvm.memset.p0i8.i64(i8* align 1 %P, i8 42, i64 8, i1 false) + tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %P, i8* align 1 %Q, i64 12, i32 1) + ret void +} + +;; Overwrite of memset by memcpy. Overwrite is weaker atomicity. We can remove +;; the memset. +define void @test17_atomic_weaker_2(i8* %P, i8* noalias %Q) nounwind ssp { +; CHECK-LABEL: @test17_atomic_weaker_2( +; CHECK-NEXT: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 [[P:%.*]], i8* align 1 [[Q:%.*]], i64 12, i1 false) +; CHECK-NEXT: ret void +; + tail call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 1 %P, i8 42, i64 8, i32 1) + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 %P, i8* align 1 %Q, i64 12, i1 false) + ret void +} + +; Should not delete the volatile memset. +define void @test17v(i8* %P, i8* %Q) nounwind ssp { +; CHECK-LABEL: @test17v( +; CHECK-NEXT: tail call void @llvm.memset.p0i8.i64(i8* [[P:%.*]], i8 42, i64 8, i1 true) +; CHECK-NEXT: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[P]], i8* [[Q:%.*]], i64 12, i1 false) +; CHECK-NEXT: ret void +; + tail call void @llvm.memset.p0i8.i64(i8* %P, i8 42, i64 8, i1 true) + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i1 false) + ret void +} + +; PR8728 +; Do not delete instruction where possible situation is: +; A = B +; A = A +; +; NB! See PR11763 - currently LLVM allows memcpy's source and destination to be +; equal (but not inequal and overlapping). 
+define void @test18(i8* %P, i8* %Q, i8* %R) nounwind ssp { +; CHECK-LABEL: @test18( +; CHECK-NEXT: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[P:%.*]], i8* [[Q:%.*]], i64 12, i1 false) +; CHECK-NEXT: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[P]], i8* [[R:%.*]], i64 12, i1 false) +; CHECK-NEXT: ret void +; + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i1 false) + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %R, i64 12, i1 false) + ret void +} + +define void @test18_atomic(i8* %P, i8* %Q, i8* %R) nounwind ssp { +; CHECK-LABEL: @test18_atomic( +; CHECK-NEXT: tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 [[P:%.*]], i8* align 1 [[Q:%.*]], i64 12, i32 1) +; CHECK-NEXT: tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 [[P]], i8* align 1 [[R:%.*]], i64 12, i32 1) +; CHECK-NEXT: ret void +; + tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %P, i8* align 1 %Q, i64 12, i32 1) + tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %P, i8* align 1 %R, i64 12, i32 1) + ret void +} + diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/memcpy-lifetimes.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/memcpy-lifetimes.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/memcpy-lifetimes.ll @@ -0,0 +1,64 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -basicaa -dse -S | FileCheck %s +; RUN: opt < %s -aa-pipeline=basic-aa -passes=dse -S | FileCheck %s + +target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" + +%struct.Village = type { [4 x %struct.Village*], %struct.Village*, %struct.List, %struct.Hosp, i32, i64 } +%struct.List = type { %struct.List*, %struct.Patient*, %struct.List* } +%struct.Patient = type { i32, i32, i32, %struct.Village* } +%struct.Hosp = type { i32, i32, i32, %struct.List, %struct.List, %struct.List, %struct.List } + +declare %struct.Village* @alloc(%struct.Village*) + +define i8* @alloc_tree() { +; CHECK-LABEL: @alloc_tree( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[FVAL:%.*]] = alloca [4 x %struct.Village*], align 16 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast [4 x %struct.Village*]* [[FVAL]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 32, i8* nonnull [[TMP0]]) +; CHECK-NEXT: [[CALL:%.*]] = tail call dereferenceable_or_null(192) i8* @malloc(i64 192) +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[CALL]] to %struct.Village* +; CHECK-NEXT: [[CALL3:%.*]] = tail call %struct.Village* @alloc(%struct.Village* [[TMP1]]) +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x %struct.Village*], [4 x %struct.Village*]* [[FVAL]], i64 0, i64 3 +; CHECK-NEXT: store %struct.Village* [[CALL3]], %struct.Village** [[ARRAYIDX]], align 8 +; CHECK-NEXT: [[CALL3_1:%.*]] = tail call %struct.Village* @alloc(%struct.Village* [[TMP1]]) +; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds [4 x %struct.Village*], [4 x %struct.Village*]* [[FVAL]], i64 0, i64 2 +; CHECK-NEXT: store %struct.Village* [[CALL3_1]], %struct.Village** [[ARRAYIDX_1]], align 16 +; CHECK-NEXT: [[CALL3_2:%.*]] = tail call %struct.Village* @alloc(%struct.Village* [[TMP1]]) +; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds [4 x %struct.Village*], [4 x %struct.Village*]* [[FVAL]], i64 0, i64 1 +; CHECK-NEXT: store %struct.Village* [[CALL3_2]], %struct.Village** [[ARRAYIDX_2]], align 8 +; CHECK-NEXT: [[CALL3_3:%.*]] = tail 
call %struct.Village* @alloc(%struct.Village* [[TMP1]]) +; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds [4 x %struct.Village*], [4 x %struct.Village*]* [[FVAL]], i64 0, i64 0 +; CHECK-NEXT: store %struct.Village* [[CALL3_3]], %struct.Village** [[ARRAYIDX_3]], align 16 +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 8 dereferenceable(32) [[CALL]], i8* nonnull align 16 dereferenceable(32) [[TMP0]], i64 32, i1 false) +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 32, i8* nonnull [[TMP0]]) +; CHECK-NEXT: ret i8* [[CALL]] +; +entry: + %fval = alloca [4 x %struct.Village*], align 16 + %0 = bitcast [4 x %struct.Village*]* %fval to i8* + call void @llvm.lifetime.start.p0i8(i64 32, i8* nonnull %0) #7 + %call = tail call dereferenceable_or_null(192) i8* @malloc(i64 192) #8 + %1 = bitcast i8* %call to %struct.Village* + %call3 = tail call %struct.Village* @alloc(%struct.Village* %1) + %arrayidx = getelementptr inbounds [4 x %struct.Village*], [4 x %struct.Village*]* %fval, i64 0, i64 3 + store %struct.Village* %call3, %struct.Village** %arrayidx, align 8 + %call3.1 = tail call %struct.Village* @alloc(%struct.Village* %1) + %arrayidx.1 = getelementptr inbounds [4 x %struct.Village*], [4 x %struct.Village*]* %fval, i64 0, i64 2 + store %struct.Village* %call3.1, %struct.Village** %arrayidx.1, align 16 + %call3.2 = tail call %struct.Village* @alloc(%struct.Village* %1) + %arrayidx.2 = getelementptr inbounds [4 x %struct.Village*], [4 x %struct.Village*]* %fval, i64 0, i64 1 + store %struct.Village* %call3.2, %struct.Village** %arrayidx.2, align 8 + %call3.3 = tail call %struct.Village* @alloc(%struct.Village* %1) + %arrayidx.3 = getelementptr inbounds [4 x %struct.Village*], [4 x %struct.Village*]* %fval, i64 0, i64 0 + store %struct.Village* %call3.3, %struct.Village** %arrayidx.3, align 16 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 8 dereferenceable(32) %call, i8* nonnull align 16 dereferenceable(32) %0, i64 32, i1 false) + call void @llvm.lifetime.end.p0i8(i64 32, i8* nonnull %0) #7 + ret i8* %call +} + +declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) +declare noalias i8* @malloc(i64) +declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i64, i1 immarg) diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/memoryssa-scan-limit.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/memoryssa-scan-limit.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/memoryssa-scan-limit.ll @@ -0,0 +1,82 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -basicaa -dse -enable-dse-memoryssa -S | FileCheck --check-prefix=NO-LIMIT %s +; RUN: opt < %s -basicaa -dse -enable-dse-memoryssa -dse-memoryssa-scanlimit=0 -S | FileCheck --check-prefix=LIMIT-0 %s +; RUN: opt < %s -basicaa -dse -enable-dse-memoryssa -dse-memoryssa-scanlimit=2 -S | FileCheck --check-prefix=LIMIT-2 %s +; RUN: opt < %s -basicaa -dse -enable-dse-memoryssa -dse-memoryssa-scanlimit=3 -S | FileCheck --check-prefix=LIMIT-3 %s + +target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" + + +define void @test2(i32* noalias %P, i32* noalias %Q, i32* noalias %R) { +; CHECK-LABEL: @test2( +; CHECK-NEXT: br i1 true, label [[BB1:%.*]], label [[BB2:%.*]] +; CHECK: bb1: +; CHECK-NEXT: br label [[BB3:%.*]] +; CHECK: bb2: +; CHECK-NEXT: br label [[BB3]] +; CHECK: bb3: +; CHECK-NEXT: 
store i32 0, i32* [[P:%.*]] +; CHECK-NEXT: ret void +; +; NO-LIMIT-LABEL: @test2( +; NO-LIMIT-NEXT: br i1 true, label [[BB1:%.*]], label [[BB2:%.*]] +; NO-LIMIT: bb1: +; NO-LIMIT-NEXT: br label [[BB3:%.*]] +; NO-LIMIT: bb2: +; NO-LIMIT-NEXT: br label [[BB3]] +; NO-LIMIT: bb3: +; NO-LIMIT-NEXT: store i32 0, i32* [[Q:%.*]] +; NO-LIMIT-NEXT: store i32 0, i32* [[R:%.*]] +; NO-LIMIT-NEXT: store i32 0, i32* [[P:%.*]] +; NO-LIMIT-NEXT: ret void +; +; LIMIT-0-LABEL: @test2( +; LIMIT-0-NEXT: store i32 1, i32* [[P:%.*]] +; LIMIT-0-NEXT: br i1 true, label [[BB1:%.*]], label [[BB2:%.*]] +; LIMIT-0: bb1: +; LIMIT-0-NEXT: br label [[BB3:%.*]] +; LIMIT-0: bb2: +; LIMIT-0-NEXT: br label [[BB3]] +; LIMIT-0: bb3: +; LIMIT-0-NEXT: store i32 0, i32* [[Q:%.*]] +; LIMIT-0-NEXT: store i32 0, i32* [[R:%.*]] +; LIMIT-0-NEXT: store i32 0, i32* [[P]] +; LIMIT-0-NEXT: ret void +; +; LIMIT-2-LABEL: @test2( +; LIMIT-2-NEXT: store i32 1, i32* [[P:%.*]] +; LIMIT-2-NEXT: br i1 true, label [[BB1:%.*]], label [[BB2:%.*]] +; LIMIT-2: bb1: +; LIMIT-2-NEXT: br label [[BB3:%.*]] +; LIMIT-2: bb2: +; LIMIT-2-NEXT: br label [[BB3]] +; LIMIT-2: bb3: +; LIMIT-2-NEXT: store i32 0, i32* [[Q:%.*]] +; LIMIT-2-NEXT: store i32 0, i32* [[R:%.*]] +; LIMIT-2-NEXT: store i32 0, i32* [[P]] +; LIMIT-2-NEXT: ret void +; +; LIMIT-3-LABEL: @test2( +; LIMIT-3-NEXT: br i1 true, label [[BB1:%.*]], label [[BB2:%.*]] +; LIMIT-3: bb1: +; LIMIT-3-NEXT: br label [[BB3:%.*]] +; LIMIT-3: bb2: +; LIMIT-3-NEXT: br label [[BB3]] +; LIMIT-3: bb3: +; LIMIT-3-NEXT: store i32 0, i32* [[Q:%.*]] +; LIMIT-3-NEXT: store i32 0, i32* [[R:%.*]] +; LIMIT-3-NEXT: store i32 0, i32* [[P:%.*]] +; LIMIT-3-NEXT: ret void +; + store i32 1, i32* %P + br i1 true, label %bb1, label %bb2 +bb1: + br label %bb3 +bb2: + br label %bb3 +bb3: + store i32 0, i32* %Q + store i32 0, i32* %R + store i32 0, i32* %P + ret void +} diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/memset-and-memcpy.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/memset-and-memcpy.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/memset-and-memcpy.ll @@ -0,0 +1,93 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -basicaa -dse -enable-dse-memoryssa -S | FileCheck %s +; RUN: opt < %s -aa-pipeline=basic-aa -passes=dse -enable-dse-memoryssa -S | FileCheck %s +target datalayout = "E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-v64:64:64-v128:128:128" + +declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i1) nounwind +declare void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* nocapture, i8, i64, i32) nounwind +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i1) nounwind +declare void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32) nounwind +declare void @llvm.init.trampoline(i8*, i8*, i8*) + + +;; Overwrite of memset by memcpy. +define void @test17(i8* %P, i8* noalias %Q) nounwind ssp { +; CHECK-LABEL: @test17( +; CHECK-NEXT: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[P:%.*]], i8* [[Q:%.*]], i64 12, i1 false) +; CHECK-NEXT: ret void +; + tail call void @llvm.memset.p0i8.i64(i8* %P, i8 42, i64 8, i1 false) + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i1 false) + ret void +} + +;; Overwrite of memset by memcpy. 
+define void @test17_atomic(i8* %P, i8* noalias %Q) nounwind ssp { +; CHECK-LABEL: @test17_atomic( +; CHECK-NEXT: tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 [[P:%.*]], i8* align 1 [[Q:%.*]], i64 12, i32 1) +; CHECK-NEXT: ret void +; + tail call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 1 %P, i8 42, i64 8, i32 1) + tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %P, i8* align 1 %Q, i64 12, i32 1) + ret void +} + +;; Overwrite of memset by memcpy. Overwrite is stronger atomicity. We can +;; remove the memset. +define void @test17_atomic_weaker(i8* %P, i8* noalias %Q) nounwind ssp { +; CHECK-LABEL: @test17_atomic_weaker( +; CHECK-NEXT: tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 [[P:%.*]], i8* align 1 [[Q:%.*]], i64 12, i32 1) +; CHECK-NEXT: ret void +; + tail call void @llvm.memset.p0i8.i64(i8* align 1 %P, i8 42, i64 8, i1 false) + tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %P, i8* align 1 %Q, i64 12, i32 1) + ret void +} + +;; Overwrite of memset by memcpy. Overwrite is weaker atomicity. We can remove +;; the memset. +define void @test17_atomic_weaker_2(i8* %P, i8* noalias %Q) nounwind ssp { +; CHECK-LABEL: @test17_atomic_weaker_2( +; CHECK-NEXT: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 [[P:%.*]], i8* align 1 [[Q:%.*]], i64 12, i1 false) +; CHECK-NEXT: ret void +; + tail call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 1 %P, i8 42, i64 8, i32 1) + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 %P, i8* align 1 %Q, i64 12, i1 false) + ret void +} + +; Should not delete the volatile memset. +define void @test17v(i8* %P, i8* %Q) nounwind ssp { +; CHECK-LABEL: @test17v( +; CHECK-NEXT: tail call void @llvm.memset.p0i8.i64(i8* [[P:%.*]], i8 42, i64 8, i1 true) +; CHECK-NEXT: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[P]], i8* [[Q:%.*]], i64 12, i1 false) +; CHECK-NEXT: ret void +; + tail call void @llvm.memset.p0i8.i64(i8* %P, i8 42, i64 8, i1 true) + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i1 false) + ret void +} + +; According to the current LangRef, memcpy's source and destination cannot +; overlap, hence the first memcpy is dead. +; +; Previously this was not allowed (PR8728), also discussed in PR11763. 
+define void @test18(i8* %P, i8* %Q, i8* %R) nounwind ssp { +; CHECK-LABEL: @test18( +; CHECK-NEXT: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[P:%.*]], i8* [[R:%.*]], i64 12, i1 false) +; CHECK-NEXT: ret void +; + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i1 false) + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %R, i64 12, i1 false) + ret void +} + +define void @test18_atomic(i8* %P, i8* %Q, i8* %R) nounwind ssp { +; CHECK-LABEL: @test18_atomic( +; CHECK-NEXT: tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 [[P:%.*]], i8* align 1 [[R:%.*]], i64 12, i32 1) +; CHECK-NEXT: ret void +; + tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %P, i8* align 1 %Q, i64 12, i32 1) + tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %P, i8* align 1 %R, i64 12, i32 1) + ret void +} diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-captures.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-captures.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-captures.ll @@ -0,0 +1,322 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -basicaa -dse -enable-dse-memoryssa -S | FileCheck %s + +target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" + +declare noalias i8* @malloc(i64) + +declare void @foo() +declare void @capture(i8*) + +; Check that we do not remove the second store, as %m is returned. +define i8* @test_return_captures_1() { +; CHECK-LABEL: @test_return_captures_1( +; CHECK-NEXT: [[M:%.*]] = call i8* @malloc(i64 24) +; CHECK-NEXT: store i8 1, i8* [[M]] +; CHECK-NEXT: ret i8* [[M]] +; + %m = call i8* @malloc(i64 24) + store i8 0, i8* %m + store i8 1, i8* %m + ret i8* %m +} + +; Same as @test_return_captures_1, but across BBs. +define i8* @test_return_captures_2() { +; CHECK-LABEL: @test_return_captures_2( +; CHECK-NEXT: [[M:%.*]] = call i8* @malloc(i64 24) +; CHECK-NEXT: br label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: store i8 1, i8* [[M]] +; CHECK-NEXT: ret i8* [[M]] +; + %m = call i8* @malloc(i64 24) + store i8 0, i8* %m + br label %exit + +exit: + store i8 1, i8* %m + ret i8* %m +} + + +%S1 = type { i8 * } + +; We cannot remove the last store to %m, because it escapes by storing it to %E. +define void @test_malloc_capture_1(%S1* %E) { +; CHECK-LABEL: @test_malloc_capture_1( +; CHECK-NEXT: [[M:%.*]] = call i8* @malloc(i64 24) +; CHECK-NEXT: br label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: [[F_PTR:%.*]] = getelementptr [[S1:%.*]], %S1* [[E:%.*]], i32 0, i32 0 +; CHECK-NEXT: store i8* [[M]], i8** [[F_PTR]] +; CHECK-NEXT: store i8 1, i8* [[M]] +; CHECK-NEXT: ret void +; + %m = call i8* @malloc(i64 24) + br label %exit + +exit: + %f.ptr = getelementptr %S1, %S1* %E, i32 0, i32 0 + store i8* %m, i8** %f.ptr + store i8 1, i8* %m + ret void +} + +; Check we do not eliminate either store. The first one cannot be eliminated, +; due to the call of @capture. The second one because %m escapes. 
+define i8* @test_malloc_capture_2() { +; CHECK-LABEL: @test_malloc_capture_2( +; CHECK-NEXT: [[M:%.*]] = call i8* @malloc(i64 24) +; CHECK-NEXT: store i8 0, i8* [[M]] +; CHECK-NEXT: call void @capture(i8* [[M]]) +; CHECK-NEXT: br label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: store i8 1, i8* [[M]] +; CHECK-NEXT: ret i8* [[M]] +; + %m = call i8* @malloc(i64 24) + store i8 0, i8* %m + call void @capture(i8* %m) + br label %exit + +exit: + store i8 1, i8* %m + ret i8* %m +} + +; We can remove the first store store i8 0, i8* %m because there are no throwing +; instructions between the 2 stores and also %m escapes after the killing store. +define i8* @test_malloc_capture_3() { +; CHECK-LABEL: @test_malloc_capture_3( +; CHECK-NEXT: [[M:%.*]] = call i8* @malloc(i64 24) +; CHECK-NEXT: br label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: store i8 1, i8* [[M]] +; CHECK-NEXT: call void @capture(i8* [[M]]) +; CHECK-NEXT: ret i8* [[M]] +; + %m = call i8* @malloc(i64 24) + store i8 0, i8* %m + br label %exit + +exit: + store i8 1, i8* %m + call void @capture(i8* %m) + ret i8* %m +} + +; TODO: We could remove the first store store i8 0, i8* %m because %m escapes +; after the killing store. +define i8* @test_malloc_capture_4() { +; CHECK-LABEL: @test_malloc_capture_4( +; CHECK-NEXT: [[M:%.*]] = call i8* @malloc(i64 24) +; CHECK-NEXT: store i8 0, i8* [[M]] +; CHECK-NEXT: call void @may_throw_readnone() +; CHECK-NEXT: br label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: store i8 1, i8* [[M]] +; CHECK-NEXT: call void @capture(i8* [[M]]) +; CHECK-NEXT: ret i8* [[M]] +; + + %m = call i8* @malloc(i64 24) + store i8 0, i8* %m + call void @may_throw_readnone() + br label %exit + +exit: + store i8 1, i8* %m + call void @capture(i8* %m) + ret i8* %m +} + + +; We cannot remove the first store store i8 0, i8* %m because %m escapes +; before the killing store and we may throw in between. +define i8* @test_malloc_capture_5() { +; CHECK-LABEL: @test_malloc_capture_5( +; CHECK-NEXT: [[M:%.*]] = call i8* @malloc(i64 24) +; CHECK-NEXT: call void @capture(i8* [[M]]) +; CHECK-NEXT: store i8 0, i8* [[M]] +; CHECK-NEXT: call void @may_throw_readnone() +; CHECK-NEXT: br label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: store i8 1, i8* [[M]] +; CHECK-NEXT: ret i8* [[M]] +; + + %m = call i8* @malloc(i64 24) + call void @capture(i8* %m) + store i8 0, i8* %m + call void @may_throw_readnone() + br label %exit + +exit: + store i8 1, i8* %m + ret i8* %m +} + + +; TODO: We could remove the first store 'store i8 0, i8* %m' even though there +; is a throwing instruction between them, because %m escapes after the killing +; store. +define i8* @test_malloc_capture_6() { +; CHECK-LABEL: @test_malloc_capture_6( +; CHECK-NEXT: [[M:%.*]] = call i8* @malloc(i64 24) +; CHECK-NEXT: store i8 0, i8* [[M]] +; CHECK-NEXT: call void @may_throw_readnone() +; CHECK-NEXT: br label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: store i8 1, i8* [[M]] +; CHECK-NEXT: call void @capture(i8* [[M]]) +; CHECK-NEXT: ret i8* [[M]] +; + + %m = call i8* @malloc(i64 24) + store i8 0, i8* %m + call void @may_throw_readnone() + br label %exit + +exit: + store i8 1, i8* %m + call void @capture(i8* %m) + ret i8* %m +} + +; We can remove the first store 'store i8 0, i8* %m' even though there is a +; throwing instruction between them, because %m escapes after the killing store. 
+define i8* @test_malloc_capture_7() { +; CHECK-LABEL: @test_malloc_capture_7( +; CHECK-NEXT: [[M:%.*]] = call i8* @malloc(i64 24) +; CHECK-NEXT: call void @may_throw() +; CHECK-NEXT: br label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: store i8 1, i8* [[M]] +; CHECK-NEXT: call void @capture(i8* [[M]]) +; CHECK-NEXT: ret i8* [[M]] +; + + %m = call i8* @malloc(i64 24) + store i8 0, i8* %m + call void @may_throw() + br label %exit + +exit: + store i8 1, i8* %m + call void @capture(i8* %m) + ret i8* %m +} +; TODO: Remove store in exit. +; Stores to stack objects can be eliminated if they are not captured inside the function. +define void @test_alloca_nocapture_1() { +; CHECK-LABEL: @test_alloca_nocapture_1( +; CHECK-NEXT: [[M:%.*]] = alloca i8 +; CHECK-NEXT: call void @foo() +; CHECK-NEXT: br label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: store i8 1, i8* [[M]] +; CHECK-NEXT: ret void +; + %m = alloca i8 + store i8 0, i8* %m + call void @foo() + br label %exit + +exit: + store i8 1, i8* %m + ret void +} + +; TODO: Remove store in exit. +; Cannot remove first store i8 0, i8* %m, as the call to @capture captures the object. +define void @test_alloca_capture_1() { +; CHECK-LABEL: @test_alloca_capture_1( +; CHECK-NEXT: [[M:%.*]] = alloca i8 +; CHECK-NEXT: store i8 0, i8* [[M]] +; CHECK-NEXT: call void @capture(i8* [[M]]) +; CHECK-NEXT: br label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: store i8 1, i8* [[M]] +; CHECK-NEXT: ret void +; + %m = alloca i8 + store i8 0, i8* %m + call void @capture(i8* %m) + br label %exit + +exit: + store i8 1, i8* %m + ret void +} + +; TODO: Remove store at exit. +; We can remove the last store to %m, even though it escapes because the alloca +; becomes invalid after the function returns. +define void @test_alloca_capture_2(%S1* %E) { +; CHECK-LABEL: @test_alloca_capture_2( +; CHECK-NEXT: [[M:%.*]] = alloca i8 +; CHECK-NEXT: br label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: [[F_PTR:%.*]] = getelementptr [[S1:%.*]], %S1* [[E:%.*]], i32 0, i32 0 +; CHECK-NEXT: store i8* [[M]], i8** [[F_PTR]] +; CHECK-NEXT: store i8 1, i8* [[M]] +; CHECK-NEXT: ret void +; + %m = alloca i8 + br label %exit + +exit: + %f.ptr = getelementptr %S1, %S1* %E, i32 0, i32 0 + store i8* %m, i8** %f.ptr + store i8 1, i8* %m + ret void +} + +; Readnone functions are not modeled in MemorySSA, but could throw. +; Make sure we do not eliminate the first store 'store i8 2, i8* %call' +define void @malloc_capture_throw_1() { +; CHECK-LABEL: @malloc_capture_throw_1( +; CHECK-NEXT: [[CALL:%.*]] = call i8* @malloc(i64 1) +; CHECK-NEXT: call void @may_capture(i8* [[CALL]]) +; CHECK-NEXT: store i8 2, i8* [[CALL]], align 1 +; CHECK-NEXT: call void @may_throw_readnone() +; CHECK-NEXT: store i8 3, i8* [[CALL]], align 1 +; CHECK-NEXT: ret void +; + %call = call i8* @malloc(i64 1) + call void @may_capture(i8* %call) + store i8 2, i8* %call, align 1 + call void @may_throw_readnone() + store i8 3, i8* %call, align 1 + ret void +} + +; Readnone functions are not modeled in MemorySSA, but could throw. 
+; Make sure we do not eliminate the first store 'store i8 2, i8* %call' +define void @malloc_capture_throw_2() { +; CHECK-LABEL: @malloc_capture_throw_2( +; CHECK-NEXT: [[CALL:%.*]] = call i8* @malloc(i64 1) +; CHECK-NEXT: call void @may_capture(i8* [[CALL]]) +; CHECK-NEXT: store i8 2, i8* [[CALL]], align 1 +; CHECK-NEXT: br label [[BB:%.*]] +; CHECK: bb: +; CHECK-NEXT: call void @may_throw_readnone() +; CHECK-NEXT: store i8 3, i8* [[CALL]], align 1 +; CHECK-NEXT: ret void +; + %call = call i8* @malloc(i64 1) + call void @may_capture(i8* %call) + store i8 2, i8* %call, align 1 + br label %bb + +bb: + call void @may_throw_readnone() + store i8 3, i8* %call, align 1 + ret void +} + + +declare void @may_capture(i8*) +declare void @may_throw_readnone() readnone +declare void @may_throw() diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-exceptions.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-exceptions.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-exceptions.ll @@ -0,0 +1,64 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -basicaa -dse -enable-dse-memoryssa -S | FileCheck %s +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" + +declare void @f() +declare i32 @__CxxFrameHandler3(...) + + +; Make sure we do not eliminate `store i32 20, i32* %sv`. Even though it is a store +; to a stack object, we can read it in the landing/catchpad. +define void @test12(i32* %p) personality i32 (...)* @__CxxFrameHandler3 { +; CHECK-LABEL: @test12( +; CHECK-NEXT: block1: +; CHECK-NEXT: [[SV:%.*]] = alloca i32 +; CHECK-NEXT: br label [[BLOCK2:%.*]] +; CHECK: block2: +; CHECK-NEXT: store i32 20, i32* [[SV]] +; CHECK-NEXT: invoke void @f() +; CHECK-NEXT: to label [[BLOCK3:%.*]] unwind label [[CATCH_DISPATCH:%.*]] +; CHECK: block3: +; CHECK-NEXT: store i32 30, i32* [[SV]] +; CHECK-NEXT: br label [[EXIT:%.*]] +; CHECK: catch.dispatch: +; CHECK-NEXT: [[CS1:%.*]] = catchswitch within none [label %catch] unwind label [[CLEANUP:%.*]] +; CHECK: catch: +; CHECK-NEXT: [[C:%.*]] = catchpad within [[CS1]] [] +; CHECK-NEXT: [[LV:%.*]] = load i32, i32* [[SV]] +; CHECK-NEXT: br label [[EXIT]] +; CHECK: cleanup: +; CHECK-NEXT: [[C1:%.*]] = cleanuppad within none [] +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: store i32 40, i32* [[SV]] +; CHECK-NEXT: ret void +; +block1: + %sv = alloca i32 + br label %block2 + +block2: + store i32 20, i32* %sv + invoke void @f() + to label %block3 unwind label %catch.dispatch + +block3: + store i32 30, i32* %sv + br label %exit + +catch.dispatch: + %cs1 = catchswitch within none [label %catch] unwind label %cleanup + +catch: + %c = catchpad within %cs1 [] + %lv = load i32, i32* %sv + br label %exit + +cleanup: + %c1 = cleanuppad within none [] + br label %exit + +exit: + store i32 40, i32* %sv + ret void +} diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-loops.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-loops.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-loops.ll @@ -0,0 +1,171 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -basicaa -dse -enable-dse-memoryssa -S | FileCheck %s + +target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" +declare void 
@llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) nounwind + +define void @test13(i32* noalias %P) { +; CHECK-LABEL: @test13( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[FOR:%.*]] +; CHECK: for: +; CHECK-NEXT: store i32 0, i32* [[P:%.*]] +; CHECK-NEXT: br i1 false, label [[FOR]], label [[END:%.*]] +; CHECK: end: +; CHECK-NEXT: ret void +; +entry: + br label %for +for: + store i32 0, i32* %P + br i1 false, label %for, label %end +end: + ret void +} + + +define void @test14(i32* noalias %P) { +; CHECK-LABEL: @test14( +; CHECK-NEXT: entry: +; CHECK-NEXT: store i32 1, i32* [[P:%.*]] +; CHECK-NEXT: br label [[FOR:%.*]] +; CHECK: for: +; CHECK-NEXT: store i32 0, i32* [[P]] +; CHECK-NEXT: br i1 false, label [[FOR]], label [[END:%.*]] +; CHECK: end: +; CHECK-NEXT: ret void +; +entry: + store i32 1, i32* %P + br label %for +for: + store i32 0, i32* %P + br i1 false, label %for, label %end +end: + ret void +} + +define void @test18(i32* noalias %P) { +; CHECK-LABEL: @test18( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P2:%.*]] = bitcast i32* [[P:%.*]] to i8* +; CHECK-NEXT: store i32 0, i32* [[P]] +; CHECK-NEXT: br label [[FOR:%.*]] +; CHECK: for: +; CHECK-NEXT: store i8 1, i8* [[P2]] +; CHECK-NEXT: [[X:%.*]] = load i32, i32* [[P]] +; CHECK-NEXT: store i8 2, i8* [[P2]] +; CHECK-NEXT: br i1 false, label [[FOR]], label [[END:%.*]] +; CHECK: end: +; CHECK-NEXT: ret void +; +entry: + %P2 = bitcast i32* %P to i8* + store i32 0, i32* %P + br label %for +for: + store i8 1, i8* %P2 + %x = load i32, i32* %P + store i8 2, i8* %P2 + br i1 false, label %for, label %end +end: + ret void +} + +define void @test21(i32* noalias %P) { +; CHECK-LABEL: @test21( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ARRAYIDX0:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1 +; CHECK-NEXT: [[P3:%.*]] = bitcast i32* [[ARRAYIDX0]] to i8* +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 4 [[P3]], i8 0, i64 28, i1 false) +; CHECK-NEXT: br label [[FOR:%.*]] +; CHECK: for: +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 1 +; CHECK-NEXT: store i32 1, i32* [[ARRAYIDX1]], align 4 +; CHECK-NEXT: br i1 false, label [[FOR]], label [[END:%.*]] +; CHECK: end: +; CHECK-NEXT: ret void +; +entry: + %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 1 + %p3 = bitcast i32* %arrayidx0 to i8* + call void @llvm.memset.p0i8.i64(i8* %p3, i8 0, i64 28, i32 4, i1 false) + br label %for +for: + %arrayidx1 = getelementptr inbounds i32, i32* %P, i64 1 + store i32 1, i32* %arrayidx1, align 4 + br i1 false, label %for, label %end +end: + ret void +} + +define void @test_loop(i32 %N, i32* noalias nocapture readonly %A, i32* noalias nocapture readonly %x, i32* noalias nocapture %b) local_unnamed_addr { +; CHECK-LABEL: @test_loop( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP27:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP27]], label [[FOR_BODY4_LR_PH_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK: for.body4.lr.ph.preheader: +; CHECK-NEXT: br label [[FOR_BODY4_LR_PH:%.*]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; CHECK: for.body4.lr.ph: +; CHECK-NEXT: [[I_028:%.*]] = phi i32 [ [[INC11:%.*]], [[FOR_COND_CLEANUP3:%.*]] ], [ 0, [[FOR_BODY4_LR_PH_PREHEADER]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 [[I_028]] +; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[I_028]], [[N]] +; CHECK-NEXT: br label [[FOR_BODY4:%.*]] +; CHECK: for.body4: +; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ 0, [[FOR_BODY4_LR_PH]] ], [ [[ADD9:%.*]], [[FOR_BODY4]] ] +; CHECK-NEXT: 
[[J_026:%.*]] = phi i32 [ 0, [[FOR_BODY4_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY4]] ]
+; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[J_026]], [[MUL]]
+; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[ADD]]
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX5]], align 4
+; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i32 [[J_026]]
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4
+; CHECK-NEXT: [[MUL7:%.*]] = mul nsw i32 [[TMP2]], [[TMP1]]
+; CHECK-NEXT: [[ADD9]] = add nsw i32 [[MUL7]], [[TMP0]]
+; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[J_026]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP3]], label [[FOR_BODY4]]
+; CHECK: for.cond.cleanup3:
+; CHECK-NEXT: store i32 [[ADD9]], i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[INC11]] = add nuw nsw i32 [[I_028]], 1
+; CHECK-NEXT: [[EXITCOND29:%.*]] = icmp eq i32 [[INC11]], [[N]]
+; CHECK-NEXT: br i1 [[EXITCOND29]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY4_LR_PH]]
+;
+entry:
+ %cmp27 = icmp sgt i32 %N, 0
+ br i1 %cmp27, label %for.body4.lr.ph.preheader, label %for.cond.cleanup
+
+for.body4.lr.ph.preheader: ; preds = %entry
+ br label %for.body4.lr.ph
+
+for.cond.cleanup: ; preds = %for.cond.cleanup3, %entry
+ ret void
+
+for.body4.lr.ph: ; preds = %for.body4.lr.ph.preheader, %for.cond.cleanup3
+ %i.028 = phi i32 [ %inc11, %for.cond.cleanup3 ], [ 0, %for.body4.lr.ph.preheader ]
+ %arrayidx = getelementptr inbounds i32, i32* %b, i32 %i.028
+ store i32 0, i32* %arrayidx, align 4
+ %mul = mul nsw i32 %i.028, %N
+ br label %for.body4
+
+for.body4: ; preds = %for.body4, %for.body4.lr.ph
+ %0 = phi i32 [ 0, %for.body4.lr.ph ], [ %add9, %for.body4 ]
+ %j.026 = phi i32 [ 0, %for.body4.lr.ph ], [ %inc, %for.body4 ]
+ %add = add nsw i32 %j.026, %mul
+ %arrayidx5 = getelementptr inbounds i32, i32* %A, i32 %add
+ %1 = load i32, i32* %arrayidx5, align 4
+ %arrayidx6 = getelementptr inbounds i32, i32* %x, i32 %j.026
+ %2 = load i32, i32* %arrayidx6, align 4
+ %mul7 = mul nsw i32 %2, %1
+ %add9 = add nsw i32 %mul7, %0
+ %inc = add nuw nsw i32 %j.026, 1
+ %exitcond = icmp eq i32 %inc, %N
+ br i1 %exitcond, label %for.cond.cleanup3, label %for.body4
+
+for.cond.cleanup3: ; preds = %for.body4
+ store i32 %add9, i32* %arrayidx, align 4
+ %inc11 = add nuw nsw i32 %i.028, 1
+ %exitcond29 = icmp eq i32 %inc11, %N
+ br i1 %exitcond29, label %for.cond.cleanup, label %for.body4.lr.ph
+}
diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-malloc-free.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-malloc-free.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-malloc-free.ll
@@ -0,0 +1,291 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+
+; XFAIL: *
+; TODO: Handling of free is not implemented yet.
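+;
+; A sketch of the transform these tests anticipate (hypothetical result, not
+; autogenerated output): a store becomes dead once the only access that
+; follows it is a free of the same object. For example, in @test16 below both
+;   store i32 1, i32* %P
+; instructions should be deleted, because %P2 (a bitcast of %P) is freed
+; before any read of that memory.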
+ +; RUN: opt < %s -basicaa -dse -enable-dse-memoryssa -S | FileCheck %s + +target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" +declare void @unknown_func() +declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) nounwind +declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) nounwind +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i1) nounwind +declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) nounwind +declare noalias i8* @malloc(i32) +declare void @free(i8* nocapture) + +define void @test16(i32* noalias %P) { +; CHECK-LABEL: @test16( +; CHECK-NEXT: [[P2:%.*]] = bitcast i32* [[P:%.*]] to i8* +; CHECK-NEXT: br i1 true, label [[BB1:%.*]], label [[BB3:%.*]] +; CHECK: bb1: +; CHECK-NEXT: br label [[BB3]] +; CHECK: bb3: +; CHECK-NEXT: call void @free(i8* [[P2]]) +; CHECK-NEXT: ret void +; + %P2 = bitcast i32* %P to i8* + store i32 1, i32* %P + br i1 true, label %bb1, label %bb3 +bb1: + store i32 1, i32* %P + br label %bb3 +bb3: + call void @free(i8* %P2) + ret void +} + + +define void @test17(i32* noalias %P) { +; CHECK-LABEL: @test17( +; CHECK-NEXT: [[P2:%.*]] = bitcast i32* [[P:%.*]] to i8* +; CHECK-NEXT: store i32 1, i32* [[P]] +; CHECK-NEXT: br i1 true, label [[BB1:%.*]], label [[BB3:%.*]] +; CHECK: bb1: +; CHECK-NEXT: call void @unknown_func() +; CHECK-NEXT: br label [[BB3]] +; CHECK: bb3: +; CHECK-NEXT: call void @free(i8* [[P2]]) +; CHECK-NEXT: ret void +; + %P2 = bitcast i32* %P to i8* + store i32 1, i32* %P + br i1 true, label %bb1, label %bb3 +bb1: + call void @unknown_func() + store i32 1, i32* %P + br label %bb3 +bb3: + call void @free(i8* %P2) + ret void +} + +define void @test6(i32* noalias %P) { +; CHECK-LABEL: @test6( +; CHECK-NEXT: store i32 0, i32* [[P:%.*]] +; CHECK-NEXT: br i1 true, label [[BB1:%.*]], label [[BB2:%.*]] +; CHECK: bb1: +; CHECK-NEXT: br label [[BB3:%.*]] +; CHECK: bb2: +; CHECK-NEXT: call void @unknown_func() +; CHECK-NEXT: br label [[BB3]] +; CHECK: bb3: +; CHECK-NEXT: store i32 0, i32* [[P]] +; CHECK-NEXT: ret void +; + store i32 0, i32* %P + br i1 true, label %bb1, label %bb2 +bb1: + br label %bb3 +bb2: + call void @unknown_func() + br label %bb3 +bb3: + store i32 0, i32* %P + ret void +} + +define void @test19(i32* noalias %P) { +; CHECK-LABEL: @test19( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ARRAYIDX0:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1 +; CHECK-NEXT: [[P3:%.*]] = bitcast i32* [[ARRAYIDX0]] to i8* +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 4 [[P3]], i8 0, i64 28, i1 false) +; CHECK-NEXT: br i1 true, label [[BB1:%.*]], label [[BB2:%.*]] +; CHECK: bb1: +; CHECK-NEXT: br label [[BB3:%.*]] +; CHECK: bb2: +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 1 +; CHECK-NEXT: store i32 1, i32* [[ARRAYIDX1]], align 4 +; CHECK-NEXT: br label [[BB3]] +; CHECK: bb3: +; CHECK-NEXT: ret void +; +entry: + %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 1 + %p3 = bitcast i32* %arrayidx0 to i8* + call void @llvm.memset.p0i8.i64(i8* %p3, i8 0, i64 28, i32 4, i1 false) + br i1 true, label %bb1, label %bb2 +bb1: + br label %bb3 +bb2: + %arrayidx1 = getelementptr inbounds i32, i32* %P, i64 1 + store i32 1, i32* %arrayidx1, align 4 + br label %bb3 +bb3: + ret void +} + + +define void @test20(i32* noalias %P) { +; CHECK-LABEL: @test20( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ARRAYIDX0:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1 +; CHECK-NEXT: [[P3:%.*]] = bitcast i32* [[ARRAYIDX0]] to i8* +; CHECK-NEXT: [[TMP0:%.*]] = 
getelementptr inbounds i8, i8* [[P3]], i64 4 +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 4 [[TMP0]], i8 0, i64 24, i1 false) +; CHECK-NEXT: br i1 true, label [[BB1:%.*]], label [[BB2:%.*]] +; CHECK: bb1: +; CHECK-NEXT: br label [[BB3:%.*]] +; CHECK: bb2: +; CHECK-NEXT: br label [[BB3]] +; CHECK: bb3: +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 1 +; CHECK-NEXT: store i32 1, i32* [[ARRAYIDX1]], align 4 +; CHECK-NEXT: ret void +; +entry: + %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 1 + %p3 = bitcast i32* %arrayidx0 to i8* + call void @llvm.memset.p0i8.i64(i8* %p3, i8 0, i64 28, i32 4, i1 false) + br i1 true, label %bb1, label %bb2 +bb1: + br label %bb3 +bb2: + br label %bb3 +bb3: + %arrayidx1 = getelementptr inbounds i32, i32* %P, i64 1 + store i32 1, i32* %arrayidx1, align 4 + ret void +} + + +define i32 @test22(i32* %P, i32* noalias %Q, i32* %R) { +; CHECK-LABEL: @test22( +; CHECK-NEXT: store i32 2, i32* [[P:%.*]] +; CHECK-NEXT: store i32 3, i32* [[Q:%.*]] +; CHECK-NEXT: [[L:%.*]] = load i32, i32* [[R:%.*]] +; CHECK-NEXT: ret i32 [[L]] +; + store i32 1, i32* %Q + store i32 2, i32* %P + store i32 3, i32* %Q + %l = load i32, i32* %R + ret i32 %l +} + + +define void @test23(i32* noalias %P) { +; CHECK-LABEL: @test23( +; CHECK-NEXT: br i1 true, label [[BB1:%.*]], label [[BB2:%.*]] +; CHECK: bb1: +; CHECK-NEXT: br label [[BB3:%.*]] +; CHECK: bb2: +; CHECK-NEXT: call void @unknown_func() +; CHECK-NEXT: br label [[BB3]] +; CHECK: bb3: +; CHECK-NEXT: store i32 0, i32* [[P:%.*]] +; CHECK-NEXT: ret void +; + br i1 true, label %bb1, label %bb2 +bb1: + store i32 0, i32* %P + br label %bb3 +bb2: + call void @unknown_func() + br label %bb3 +bb3: + store i32 0, i32* %P + ret void +} + + +define void @test24(i32* noalias %P) { +; CHECK-LABEL: @test24( +; CHECK-NEXT: br i1 true, label [[BB2:%.*]], label [[BB1:%.*]] +; CHECK: bb1: +; CHECK-NEXT: br label [[BB3:%.*]] +; CHECK: bb2: +; CHECK-NEXT: call void @unknown_func() +; CHECK-NEXT: br label [[BB3]] +; CHECK: bb3: +; CHECK-NEXT: store i32 0, i32* [[P:%.*]] +; CHECK-NEXT: ret void +; + br i1 true, label %bb2, label %bb1 +bb1: + store i32 0, i32* %P + br label %bb3 +bb2: + call void @unknown_func() + br label %bb3 +bb3: + store i32 0, i32* %P + ret void +} + +define i8* @test26() { +; CHECK-LABEL: @test26( +; CHECK-NEXT: bb1: +; CHECK-NEXT: br i1 true, label [[BB2:%.*]], label [[BB3:%.*]] +; CHECK: bb2: +; CHECK-NEXT: [[M:%.*]] = call noalias i8* @malloc(i32 10) +; CHECK-NEXT: store i8 1, i8* [[M]] +; CHECK-NEXT: br label [[BB3]] +; CHECK: bb3: +; CHECK-NEXT: [[R:%.*]] = phi i8* [ null, [[BB1:%.*]] ], [ [[M]], [[BB2]] ] +; CHECK-NEXT: ret i8* [[R]] +; +bb1: + br i1 true, label %bb2, label %bb3 +bb2: + %m = call noalias i8* @malloc(i32 10) + store i8 1, i8* %m + br label %bb3 +bb3: + %r = phi i8* [ null, %bb1 ], [ %m, %bb2 ] + ret i8* %r +} + + +define void @test27() { +; CHECK-LABEL: @test27( +; CHECK-NEXT: bb1: +; CHECK-NEXT: br i1 true, label [[BB2:%.*]], label [[BB3:%.*]] +; CHECK: bb2: +; CHECK-NEXT: [[M:%.*]] = call noalias i8* @malloc(i32 10) +; CHECK-NEXT: br label [[BB3]] +; CHECK: bb3: +; CHECK-NEXT: [[R:%.*]] = phi i8* [ null, [[BB1:%.*]] ], [ [[M]], [[BB2]] ] +; CHECK-NEXT: ret void +; +bb1: + br i1 true, label %bb2, label %bb3 +bb2: + %m = call noalias i8* @malloc(i32 10) + store i8 1, i8* %m + br label %bb3 +bb3: + %r = phi i8* [ null, %bb1 ], [ %m, %bb2 ] + ret void +} + + +define i8* @test28() { +; CHECK-LABEL: @test28( +; CHECK-NEXT: bb0: +; CHECK-NEXT: [[M:%.*]] = call noalias i8* 
@malloc(i32 10) +; CHECK-NEXT: [[MC0:%.*]] = bitcast i8* [[M]] to i8* +; CHECK-NEXT: [[MC1:%.*]] = bitcast i8* [[MC0]] to i8* +; CHECK-NEXT: [[MC2:%.*]] = bitcast i8* [[MC1]] to i8* +; CHECK-NEXT: [[MC3:%.*]] = bitcast i8* [[MC2]] to i8* +; CHECK-NEXT: [[MC4:%.*]] = bitcast i8* [[MC3]] to i8* +; CHECK-NEXT: [[MC5:%.*]] = bitcast i8* [[MC4]] to i8* +; CHECK-NEXT: [[MC6:%.*]] = bitcast i8* [[MC5]] to i8* +; CHECK-NEXT: [[M0:%.*]] = bitcast i8* [[MC6]] to i8* +; CHECK-NEXT: store i8 2, i8* [[M]] +; CHECK-NEXT: ret i8* [[M0]] +; +bb0: + %m = call noalias i8* @malloc(i32 10) + %mc0 = bitcast i8* %m to i8* + %mc1 = bitcast i8* %mc0 to i8* + %mc2 = bitcast i8* %mc1 to i8* + %mc3 = bitcast i8* %mc2 to i8* + %mc4 = bitcast i8* %mc3 to i8* + %mc5 = bitcast i8* %mc4 to i8* + %mc6 = bitcast i8* %mc5 to i8* + %m0 = bitcast i8* %mc6 to i8* + store i8 2, i8* %m + ret i8* %m0 +} diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-memintrinsics.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-memintrinsics.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-memintrinsics.ll @@ -0,0 +1,70 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -basicaa -dse -enable-dse-memoryssa -S | FileCheck %s + +target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" +declare void @unknown_func() +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i1) nounwind +declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) nounwind + +define void @test19(i32* noalias %P) { +; CHECK-LABEL: @test19( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ARRAYIDX0:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1 +; CHECK-NEXT: [[P3:%.*]] = bitcast i32* [[ARRAYIDX0]] to i8* +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 4 [[P3]], i8 0, i64 28, i1 false) +; CHECK-NEXT: br i1 true, label [[BB1:%.*]], label [[BB2:%.*]] +; CHECK: bb1: +; CHECK-NEXT: br label [[BB3:%.*]] +; CHECK: bb2: +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 1 +; CHECK-NEXT: store i32 1, i32* [[ARRAYIDX1]], align 4 +; CHECK-NEXT: br label [[BB3]] +; CHECK: bb3: +; CHECK-NEXT: ret void +; +entry: + %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 1 + %p3 = bitcast i32* %arrayidx0 to i8* + call void @llvm.memset.p0i8.i64(i8* %p3, i8 0, i64 28, i32 4, i1 false) + br i1 true, label %bb1, label %bb2 +bb1: + br label %bb3 +bb2: + %arrayidx1 = getelementptr inbounds i32, i32* %P, i64 1 + store i32 1, i32* %arrayidx1, align 4 + br label %bb3 +bb3: + ret void +} + + +define void @test20(i32* noalias %P) { +; CHECK-LABEL: @test20( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ARRAYIDX0:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1 +; CHECK-NEXT: [[P3:%.*]] = bitcast i32* [[ARRAYIDX0]] to i8* +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 4 [[P3]], i8 0, i64 28, i1 false) +; CHECK-NEXT: br i1 true, label [[BB1:%.*]], label [[BB2:%.*]] +; CHECK: bb1: +; CHECK-NEXT: br label [[BB3:%.*]] +; CHECK: bb2: +; CHECK-NEXT: br label [[BB3]] +; CHECK: bb3: +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 1 +; CHECK-NEXT: store i32 1, i32* [[ARRAYIDX1]], align 4 +; CHECK-NEXT: ret void +; +entry: + %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 1 + %p3 = bitcast i32* %arrayidx0 to i8* + call void @llvm.memset.p0i8.i64(i8* %p3, i8 0, i64 28, i32 4, i1 false) + br i1 true, label %bb1, label %bb2 +bb1: + br label %bb3 +bb2: + br 
label %bb3 +bb3: + %arrayidx1 = getelementptr inbounds i32, i32* %P, i64 1 + store i32 1, i32* %arrayidx1, align 4 + ret void +} diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-memoryphis.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-memoryphis.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-memoryphis.ll @@ -0,0 +1,105 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -basicaa -dse -enable-dse-memoryssa -S | FileCheck %s + +target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" + + +define void @test4(i32* noalias %P) { +; CHECK-LABEL: @test4( +; CHECK-NEXT: store i32 0, i32* [[P:%.*]] +; CHECK-NEXT: br i1 true, label [[BB1:%.*]], label [[BB2:%.*]] +; CHECK: bb1: +; CHECK-NEXT: br label [[BB3:%.*]] +; CHECK: bb2: +; CHECK-NEXT: [[X:%.*]] = load i32, i32* [[P]] +; CHECK-NEXT: br label [[BB3]] +; CHECK: bb3: +; CHECK-NEXT: store i32 0, i32* [[P]] +; CHECK-NEXT: ret void +; + store i32 0, i32* %P + br i1 true, label %bb1, label %bb2 +bb1: + br label %bb3 +bb2: + %x = load i32, i32* %P + br label %bb3 +bb3: + store i32 0, i32* %P + ret void +} + +define void @test5(i32* noalias %P) { +; CHECK-LABEL: @test5( +; CHECK-NEXT: br i1 true, label [[BB1:%.*]], label [[BB2:%.*]] +; CHECK: bb1: +; CHECK-NEXT: store i32 1, i32* [[P:%.*]] +; CHECK-NEXT: br label [[BB3:%.*]] +; CHECK: bb2: +; CHECK-NEXT: store i32 1, i32* [[P]] +; CHECK-NEXT: br label [[BB3]] +; CHECK: bb3: +; CHECK-NEXT: store i32 0, i32* [[P]] +; CHECK-NEXT: ret void +; + br i1 true, label %bb1, label %bb2 +bb1: + store i32 1, i32* %P + br label %bb3 +bb2: + store i32 1, i32* %P + br label %bb3 +bb3: + store i32 0, i32* %P + ret void +} + +define void @test8(i32* %P, i32* %Q) { +; CHECK-LABEL: @test8( +; CHECK-NEXT: br i1 true, label [[BB1:%.*]], label [[BB2:%.*]] +; CHECK: bb1: +; CHECK-NEXT: store i32 1, i32* [[P:%.*]] +; CHECK-NEXT: br label [[BB3:%.*]] +; CHECK: bb2: +; CHECK-NEXT: store i32 1, i32* [[Q:%.*]] +; CHECK-NEXT: br label [[BB3]] +; CHECK: bb3: +; CHECK-NEXT: store i32 0, i32* [[P]] +; CHECK-NEXT: ret void +; + br i1 true, label %bb1, label %bb2 +bb1: + store i32 1, i32* %P + br label %bb3 +bb2: + store i32 1, i32* %Q + br label %bb3 +bb3: + store i32 0, i32* %P + ret void +} + +define void @test10(i32* noalias %P) { +; CHECK-LABEL: @test10( +; CHECK-NEXT: [[P2:%.*]] = bitcast i32* [[P:%.*]] to i8* +; CHECK-NEXT: store i32 0, i32* [[P]] +; CHECK-NEXT: br i1 true, label [[BB1:%.*]], label [[BB2:%.*]] +; CHECK: bb1: +; CHECK-NEXT: br label [[BB3:%.*]] +; CHECK: bb2: +; CHECK-NEXT: br label [[BB3]] +; CHECK: bb3: +; CHECK-NEXT: store i8 1, i8* [[P2]] +; CHECK-NEXT: ret void +; + %P2 = bitcast i32* %P to i8* + store i32 0, i32* %P + br i1 true, label %bb1, label %bb2 +bb1: + br label %bb3 +bb2: + br label %bb3 +bb3: + store i8 1, i8* %P2 + ret void +} diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-partial.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-partial.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-partial.ll @@ -0,0 +1,56 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -basicaa -dse -enable-dse-memoryssa -S | FileCheck %s + +target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" + + +define void @second_store_smaller(i32* noalias %P) { +; CHECK-LABEL: @second_store_smaller( +; CHECK-NEXT: store i32 1, i32* 
[[P:%.*]] +; CHECK-NEXT: br i1 true, label [[BB1:%.*]], label [[BB2:%.*]] +; CHECK: bb1: +; CHECK-NEXT: br label [[BB3:%.*]] +; CHECK: bb2: +; CHECK-NEXT: br label [[BB3]] +; CHECK: bb3: +; CHECK-NEXT: [[P_I16:%.*]] = bitcast i32* [[P]] to i16* +; CHECK-NEXT: store i16 0, i16* [[P_I16]] +; CHECK-NEXT: ret void +; + store i32 1, i32* %P + br i1 true, label %bb1, label %bb2 +bb1: + br label %bb3 +bb2: + br label %bb3 +bb3: + %P.i16 = bitcast i32* %P to i16* + store i16 0, i16* %P.i16 + ret void +} + + +define void @second_store_bigger(i32* noalias %P) { +; CHECK-LABEL: @second_store_bigger( +; CHECK-NEXT: br i1 true, label [[BB1:%.*]], label [[BB2:%.*]] +; CHECK: bb1: +; CHECK-NEXT: br label [[BB3:%.*]] +; CHECK: bb2: +; CHECK-NEXT: br label [[BB3]] +; CHECK: bb3: +; CHECK-NEXT: [[P_I64:%.*]] = bitcast i32* [[P:%.*]] to i64* +; CHECK-NEXT: store i64 0, i64* [[P_I64]] +; CHECK-NEXT: ret void +; + store i32 1, i32* %P + br i1 true, label %bb1, label %bb2 +bb1: + br label %bb3 +bb2: + br label %bb3 +bb3: + %P.i64 = bitcast i32* %P to i64* + store i64 0, i64* %P.i64 + ret void +} + diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-simple.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-simple.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-simple.ll @@ -0,0 +1,114 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -basicaa -dse -enable-dse-memoryssa -S | FileCheck %s + +target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" + + +define void @test2(i32* noalias %P) { +; CHECK-LABEL: @test2( +; CHECK-NEXT: br i1 true, label [[BB1:%.*]], label [[BB2:%.*]] +; CHECK: bb1: +; CHECK-NEXT: br label [[BB3:%.*]] +; CHECK: bb2: +; CHECK-NEXT: br label [[BB3]] +; CHECK: bb3: +; CHECK-NEXT: store i32 0, i32* [[P:%.*]] +; CHECK-NEXT: ret void +; + store i32 1, i32* %P + br i1 true, label %bb1, label %bb2 +bb1: + br label %bb3 +bb2: + br label %bb3 +bb3: + store i32 0, i32* %P + ret void +} + +define void @test3(i32* noalias %P) { +; CHECK-LABEL: @test3( +; CHECK-NEXT: store i32 0, i32* [[P:%.*]] +; CHECK-NEXT: br i1 true, label [[BB1:%.*]], label [[BB2:%.*]] +; CHECK: bb1: +; CHECK-NEXT: br label [[BB3:%.*]] +; CHECK: bb2: +; CHECK-NEXT: store i32 0, i32* [[P]] +; CHECK-NEXT: br label [[BB3]] +; CHECK: bb3: +; CHECK-NEXT: ret void +; + store i32 0, i32* %P + br i1 true, label %bb1, label %bb2 +bb1: + br label %bb3 +bb2: + store i32 0, i32* %P + br label %bb3 +bb3: + ret void +} + + +define void @test7(i32* noalias %P, i32* noalias %Q) { +; CHECK-LABEL: @test7( +; CHECK-NEXT: br i1 true, label [[BB1:%.*]], label [[BB2:%.*]] +; CHECK: bb1: +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[P:%.*]] +; CHECK-NEXT: br label [[BB3:%.*]] +; CHECK: bb2: +; CHECK-NEXT: br label [[BB3]] +; CHECK: bb3: +; CHECK-NEXT: store i32 0, i32* [[Q:%.*]] +; CHECK-NEXT: store i32 0, i32* [[P]] +; CHECK-NEXT: ret void +; + store i32 1, i32* %Q + br i1 true, label %bb1, label %bb2 +bb1: + load i32, i32* %P + br label %bb3 +bb2: + br label %bb3 +bb3: + store i32 0, i32* %Q + store i32 0, i32* %P + ret void +} + +define i32 @test22(i32* %P, i32* noalias %Q, i32* %R) { +; CHECK-LABEL: @test22( +; CHECK-NEXT: store i32 2, i32* [[P:%.*]] +; CHECK-NEXT: store i32 3, i32* [[Q:%.*]] +; CHECK-NEXT: [[L:%.*]] = load i32, i32* [[R:%.*]] +; CHECK-NEXT: ret i32 [[L]] +; + store i32 1, i32* %Q + store i32 2, i32* %P + store i32 3, i32* %Q + %l = load i32, i32* %R + ret i32 %l +} + +define void @test9(i32* 
noalias %P) { +; CHECK-LABEL: @test9( +; CHECK-NEXT: store i32 0, i32* [[P:%.*]] +; CHECK-NEXT: br i1 true, label [[BB1:%.*]], label [[BB2:%.*]] +; CHECK: bb1: +; CHECK-NEXT: br label [[BB3:%.*]] +; CHECK: bb2: +; CHECK-NEXT: ret void +; CHECK: bb3: +; CHECK-NEXT: store i32 0, i32* [[P]] +; CHECK-NEXT: ret void +; + store i32 0, i32* %P + br i1 true, label %bb1, label %bb2 +bb1: + br label %bb3 +bb2: + ret void +bb3: + store i32 0, i32* %P + ret void +} diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-throwing.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-throwing.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-throwing.ll @@ -0,0 +1,95 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; XFAIL: * +; RUN: opt < %s -basicaa -dse -enable-dse-memoryssa -S | FileCheck %s + +target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" +declare void @unknown_func() + +define void @test6(i32* noalias %P) { +; CHECK-LABEL: @test6( +; CHECK-NEXT: store i32 0, i32* [[P:%.*]] +; CHECK-NEXT: br i1 true, label [[BB1:%.*]], label [[BB2:%.*]] +; CHECK: bb1: +; CHECK-NEXT: br label [[BB3:%.*]] +; CHECK: bb2: +; CHECK-NEXT: call void @unknown_func() +; CHECK-NEXT: br label [[BB3]] +; CHECK: bb3: +; CHECK-NEXT: store i32 0, i32* [[P]] +; CHECK-NEXT: ret void +; + store i32 0, i32* %P + br i1 true, label %bb1, label %bb2 +bb1: + br label %bb3 +bb2: + call void @unknown_func() + br label %bb3 +bb3: + store i32 0, i32* %P + ret void +} + +define i32 @test22(i32* %P, i32* noalias %Q, i32* %R) { +; CHECK-LABEL: @test22( +; CHECK-NEXT: store i32 2, i32* [[P:%.*]] +; CHECK-NEXT: store i32 3, i32* [[Q:%.*]] +; CHECK-NEXT: [[L:%.*]] = load i32, i32* [[R:%.*]] +; CHECK-NEXT: ret i32 [[L]] +; + store i32 1, i32* %Q + store i32 2, i32* %P + store i32 3, i32* %Q + %l = load i32, i32* %R + ret i32 %l +} + + +define void @test23(i32* noalias %P) { +; CHECK-LABEL: @test23( +; CHECK-NEXT: br i1 true, label [[BB1:%.*]], label [[BB2:%.*]] +; CHECK: bb1: +; CHECK-NEXT: br label [[BB3:%.*]] +; CHECK: bb2: +; CHECK-NEXT: call void @unknown_func() +; CHECK-NEXT: br label [[BB3]] +; CHECK: bb3: +; CHECK-NEXT: store i32 0, i32* [[P:%.*]] +; CHECK-NEXT: ret void +; + br i1 true, label %bb1, label %bb2 +bb1: + store i32 0, i32* %P + br label %bb3 +bb2: + call void @unknown_func() + br label %bb3 +bb3: + store i32 0, i32* %P + ret void +} + + +define void @test24(i32* noalias %P) { +; CHECK-LABEL: @test24( +; CHECK-NEXT: br i1 true, label [[BB2:%.*]], label [[BB1:%.*]] +; CHECK: bb1: +; CHECK-NEXT: br label [[BB3:%.*]] +; CHECK: bb2: +; CHECK-NEXT: call void @unknown_func() +; CHECK-NEXT: br label [[BB3]] +; CHECK: bb3: +; CHECK-NEXT: store i32 0, i32* [[P:%.*]] +; CHECK-NEXT: ret void +; + br i1 true, label %bb2, label %bb1 +bb1: + store i32 0, i32* %P + br label %bb3 +bb2: + call void @unknown_func() + br label %bb3 +bb3: + store i32 0, i32* %P + ret void +} diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/simple.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/simple.ll --- a/llvm/test/Transforms/DeadStoreElimination/MSSA/simple.ll +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/simple.ll @@ -68,185 +68,6 @@ declare noalias i8* @malloc(i32) declare noalias i8* @calloc(i32, i32) - -; PR8701 - -;; Fully dead overwrite of memcpy. 
-define void @test15(i8* %P, i8* %Q) nounwind ssp { -; CHECK-LABEL: @test15( -; CHECK-NEXT: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[P:%.*]], i8* [[Q:%.*]], i64 12, i1 false) -; CHECK-NEXT: ret void -; - tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i1 false) - tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i1 false) - ret void -} - -;; Fully dead overwrite of memcpy. -define void @test15_atomic(i8* %P, i8* %Q) nounwind ssp { -; CHECK-LABEL: @test15_atomic( -; CHECK-NEXT: tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 [[P:%.*]], i8* align 1 [[Q:%.*]], i64 12, i32 1) -; CHECK-NEXT: ret void -; - tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %P, i8* align 1 %Q, i64 12, i32 1) - tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %P, i8* align 1 %Q, i64 12, i32 1) - ret void -} - -;; Fully dead overwrite of memcpy. -define void @test15_atomic_weaker(i8* %P, i8* %Q) nounwind ssp { -; CHECK-LABEL: @test15_atomic_weaker( -; CHECK-NEXT: tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 [[P:%.*]], i8* align 1 [[Q:%.*]], i64 12, i32 1) -; CHECK-NEXT: ret void -; - tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 %P, i8* align 1 %Q, i64 12, i1 false) - tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %P, i8* align 1 %Q, i64 12, i32 1) - ret void -} - -;; Fully dead overwrite of memcpy. -define void @test15_atomic_weaker_2(i8* %P, i8* %Q) nounwind ssp { -; CHECK-LABEL: @test15_atomic_weaker_2( -; CHECK-NEXT: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 [[P:%.*]], i8* align 1 [[Q:%.*]], i64 12, i1 false) -; CHECK-NEXT: ret void -; - tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %P, i8* align 1 %Q, i64 12, i32 1) - tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 %P, i8* align 1 %Q, i64 12, i1 false) - ret void -} - -;; Full overwrite of smaller memcpy. -define void @test16(i8* %P, i8* %Q) nounwind ssp { -; CHECK-LABEL: @test16( -; CHECK-NEXT: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[P:%.*]], i8* [[Q:%.*]], i64 12, i1 false) -; CHECK-NEXT: ret void -; - tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 8, i1 false) - tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i1 false) - ret void -} - -;; Full overwrite of smaller memcpy. 
-define void @test16_atomic(i8* %P, i8* %Q) nounwind ssp { -; CHECK-LABEL: @test16_atomic( -; CHECK-NEXT: tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 [[P:%.*]], i8* align 1 [[Q:%.*]], i64 12, i32 1) -; CHECK-NEXT: ret void -; - tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %P, i8* align 1 %Q, i64 8, i32 1) - tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %P, i8* align 1 %Q, i64 12, i32 1) - ret void -} - -;; Full overwrite of smaller memory where overwrite has stronger atomicity -define void @test16_atomic_weaker(i8* %P, i8* %Q) nounwind ssp { -; CHECK-LABEL: @test16_atomic_weaker( -; CHECK-NEXT: tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 [[P:%.*]], i8* align 1 [[Q:%.*]], i64 12, i32 1) -; CHECK-NEXT: ret void -; - tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 %P, i8* align 1 %Q, i64 8, i1 false) - tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %P, i8* align 1 %Q, i64 12, i32 1) - ret void -} - -;; Full overwrite of smaller memory where overwrite has weaker atomicity. -define void @test16_atomic_weaker_2(i8* %P, i8* %Q) nounwind ssp { -; CHECK-LABEL: @test16_atomic_weaker_2( -; CHECK-NEXT: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 [[P:%.*]], i8* align 1 [[Q:%.*]], i64 12, i1 false) -; CHECK-NEXT: ret void -; - tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %P, i8* align 1 %Q, i64 8, i32 1) - tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 %P, i8* align 1 %Q, i64 12, i1 false) - ret void -} - -;; Overwrite of memset by memcpy. -define void @test17(i8* %P, i8* noalias %Q) nounwind ssp { -; CHECK-LABEL: @test17( -; CHECK-NEXT: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[P:%.*]], i8* [[Q:%.*]], i64 12, i1 false) -; CHECK-NEXT: ret void -; - tail call void @llvm.memset.p0i8.i64(i8* %P, i8 42, i64 8, i1 false) - tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i1 false) - ret void -} - -;; Overwrite of memset by memcpy. -define void @test17_atomic(i8* %P, i8* noalias %Q) nounwind ssp { -; CHECK-LABEL: @test17_atomic( -; CHECK-NEXT: tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 [[P:%.*]], i8* align 1 [[Q:%.*]], i64 12, i32 1) -; CHECK-NEXT: ret void -; - tail call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 1 %P, i8 42, i64 8, i32 1) - tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %P, i8* align 1 %Q, i64 12, i32 1) - ret void -} - -;; Overwrite of memset by memcpy. Overwrite is stronger atomicity. We can -;; remove the memset. -define void @test17_atomic_weaker(i8* %P, i8* noalias %Q) nounwind ssp { -; CHECK-LABEL: @test17_atomic_weaker( -; CHECK-NEXT: tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 [[P:%.*]], i8* align 1 [[Q:%.*]], i64 12, i32 1) -; CHECK-NEXT: ret void -; - tail call void @llvm.memset.p0i8.i64(i8* align 1 %P, i8 42, i64 8, i1 false) - tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %P, i8* align 1 %Q, i64 12, i32 1) - ret void -} - -;; Overwrite of memset by memcpy. Overwrite is weaker atomicity. We can remove -;; the memset. 
-define void @test17_atomic_weaker_2(i8* %P, i8* noalias %Q) nounwind ssp { -; CHECK-LABEL: @test17_atomic_weaker_2( -; CHECK-NEXT: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 [[P:%.*]], i8* align 1 [[Q:%.*]], i64 12, i1 false) -; CHECK-NEXT: ret void -; - tail call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 1 %P, i8 42, i64 8, i32 1) - tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 %P, i8* align 1 %Q, i64 12, i1 false) - ret void -} - -; Should not delete the volatile memset. -define void @test17v(i8* %P, i8* %Q) nounwind ssp { -; CHECK-LABEL: @test17v( -; CHECK-NEXT: tail call void @llvm.memset.p0i8.i64(i8* [[P:%.*]], i8 42, i64 8, i1 true) -; CHECK-NEXT: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[P]], i8* [[Q:%.*]], i64 12, i1 false) -; CHECK-NEXT: ret void -; - tail call void @llvm.memset.p0i8.i64(i8* %P, i8 42, i64 8, i1 true) - tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i1 false) - ret void -} - -; PR8728 -; Do not delete instruction where possible situation is: -; A = B -; A = A -; -; NB! See PR11763 - currently LLVM allows memcpy's source and destination to be -; equal (but not inequal and overlapping). -define void @test18(i8* %P, i8* %Q, i8* %R) nounwind ssp { -; CHECK-LABEL: @test18( -; CHECK-NEXT: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[P:%.*]], i8* [[Q:%.*]], i64 12, i1 false) -; CHECK-NEXT: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[P]], i8* [[R:%.*]], i64 12, i1 false) -; CHECK-NEXT: ret void -; - tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i1 false) - tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %R, i64 12, i1 false) - ret void -} - -define void @test18_atomic(i8* %P, i8* %Q, i8* %R) nounwind ssp { -; CHECK-LABEL: @test18_atomic( -; CHECK-NEXT: tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 [[P:%.*]], i8* align 1 [[Q:%.*]], i64 12, i32 1) -; CHECK-NEXT: tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 [[P]], i8* align 1 [[R:%.*]], i64 12, i32 1) -; CHECK-NEXT: ret void -; - tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %P, i8* align 1 %Q, i64 12, i32 1) - tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %P, i8* align 1 %R, i64 12, i32 1) - ret void -} - - ; The store here is not dead because the byval call reads it. declare void @test19f({i32}* byval align 4 %P) @@ -397,7 +218,7 @@ ; it could unwind define void @test34(i32* noalias %p) { ; CHECK-LABEL: @test34( -; CHECK-NEXT: store i32 1, i32* [[P]] +; CHECK-NEXT: store i32 1, i32* [[P:%.*]] ; CHECK-NEXT: call void @unknown_func() ; CHECK-NEXT: store i32 0, i32* [[P]] ; CHECK-NEXT: ret void @@ -408,12 +229,10 @@ ret void } -; TODO ; Remove redundant store even with an unwinding function in the same block define void @test35(i32* noalias %p) { ; CHECK-LABEL: @test35( ; CHECK-NEXT: call void @unknown_func() -; CHECK-NEXT: store i32 1, i32* [[P:%.*]] ; CHECK-NEXT: store i32 0, i32* [[P:%.*]] ; CHECK-NEXT: ret void ; @@ -472,11 +291,10 @@ ret void } -; Same caveat about memcpy as in @test18 applies here. +; The memmove is dead, because memcpy arguments cannot overlap. 
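+; (That is: the later memcpy rewrites all 12 bytes at %P, and because a
+; memcpy's source must not overlap its destination, it cannot read the bytes
+; the memmove stored, so the memmove's write is never observed.)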
define void @test38(i8* %P, i8* %Q, i8* %R) { ; CHECK-LABEL: @test38( -; CHECK-NEXT: tail call void @llvm.memmove.p0i8.p0i8.i64(i8* [[P:%.*]], i8* [[Q:%.*]], i64 12, i1 false) -; CHECK-NEXT: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[P]], i8* [[R:%.*]], i64 12, i1 false) +; CHECK-NEXT: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[P:%.*]], i8* [[R:%.*]], i64 12, i1 false) ; CHECK-NEXT: ret void ; @@ -485,10 +303,10 @@ ret void } +; The memmove is dead, because memcpy arguments cannot overlap. define void @test38_atomic(i8* %P, i8* %Q, i8* %R) { ; CHECK-LABEL: @test38_atomic( -; CHECK-NEXT: tail call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 [[P:%.*]], i8* align 1 [[Q:%.*]], i64 12, i32 1) -; CHECK-NEXT: tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 [[P]], i8* align 1 [[R:%.*]], i64 12, i32 1) +; CHECK-NEXT: tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 [[P:%.*]], i8* align 1 [[R:%.*]], i64 12, i32 1) ; CHECK-NEXT: ret void ; @@ -557,12 +375,13 @@ ; throwing should leave store i32 1, not remove from the free. declare void @free(i8* nocapture) define void @test41(i32* noalias %P) { -; NOCHECK-LABEL: @test41( -; NOCHECK-NEXT: [[P2:%.*]] = bitcast i32* [[P:%.*]] to i8* -; NOCHECK-NEXT: store i32 1, i32* [[P]] -; NOCHECK-NEXT: call void @unknown_func() -; NOCHECK-NEXT: call void @free(i8* [[P2]]) -; NOCHECK-NEXT: ret void +; CHECK-LABEL: @test41( +; CHECK-NEXT: [[P2:%.*]] = bitcast i32* [[P:%.*]] to i8* +; CHECK-NEXT: store i32 1, i32* [[P]] +; CHECK-NEXT: call void @unknown_func() +; CHECK-NEXT: store i32 2, i32* [[P]] +; CHECK-NEXT: call void @free(i8* [[P2]]) +; CHECK-NEXT: ret void ; %P2 = bitcast i32* %P to i8* store i32 1, i32* %P @@ -573,12 +392,12 @@ } define void @test42(i32* %P, i32* %Q) { -; NOCHECK-LABEL: @test42( -; NOCHECK-NEXT: store i32 1, i32* [[P:%.*]] -; NOCHECK-NEXT: [[P2:%.*]] = bitcast i32* [[P]] to i8* -; NOCHECK-NEXT: store i32 2, i32* [[Q:%.*]] -; NOCHECK-NEXT: store i8 3, i8* [[P2]] -; NOCHECK-NEXT: ret void +; CHECK-LABEL: @test42( +; CHECK-NEXT: store i32 1, i32* [[P:%.*]] +; CHECK-NEXT: [[P2:%.*]] = bitcast i32* [[P]] to i8* +; CHECK-NEXT: store i32 2, i32* [[Q:%.*]] +; CHECK-NEXT: store i8 3, i8* [[P2]] +; CHECK-NEXT: ret void ; store i32 1, i32* %P %P2 = bitcast i32* %P to i8* @@ -588,12 +407,12 @@ } define void @test42a(i32* %P, i32* %Q) { -; NOCHECK-LABEL: @test42a( -; NOCHECK-NEXT: store atomic i32 1, i32* [[P:%.*]] unordered, align 4 -; NOCHECK-NEXT: [[P2:%.*]] = bitcast i32* [[P]] to i8* -; NOCHECK-NEXT: store atomic i32 2, i32* [[Q:%.*]] unordered, align 4 -; NOCHECK-NEXT: store atomic i8 3, i8* [[P2]] unordered, align 4 -; NOCHECK-NEXT: ret void +; CHECK-LABEL: @test42a( +; CHECK-NEXT: store atomic i32 1, i32* [[P:%.*]] unordered, align 4 +; CHECK-NEXT: [[P2:%.*]] = bitcast i32* [[P]] to i8* +; CHECK-NEXT: store atomic i32 2, i32* [[Q:%.*]] unordered, align 4 +; CHECK-NEXT: store atomic i8 3, i8* [[P2]] unordered, align 4 +; CHECK-NEXT: ret void ; store atomic i32 1, i32* %P unordered, align 4 %P2 = bitcast i32* %P to i8*