diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp --- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -29,7 +29,10 @@ #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/MemoryDependenceAnalysis.h" #include "llvm/Analysis/MemoryLocation.h" +#include "llvm/Analysis/MemorySSA.h" +#include "llvm/Analysis/MemorySSAUpdater.h" #include "llvm/Analysis/OrderedBasicBlock.h" +#include "llvm/Analysis/PostDominators.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Argument.h" @@ -40,6 +43,7 @@ #include "llvm/IR/DataLayout.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" +#include "llvm/IR/InstIterator.h" #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" @@ -87,11 +91,20 @@ cl::init(true), cl::Hidden, cl::desc("Enable partial store merging in DSE")); -// Temporary dummy option for tests. 
static cl::opt EnableMemorySSA("enable-dse-memoryssa", cl::init(false), cl::Hidden, cl::desc("Use the new MemorySSA-backed DSE.")); +static cl::opt + MemorySSAScanLimit("dse-memoryssa-scanlimit", cl::init(100), cl::Hidden, + cl::desc("The number of memory instructions to scan for " + "dead store elimination (default = 100)")); + +static cl::opt MemorySSADefsPerBlockLimit( + "dse-memoryssa-defs-per-block-limit", cl::init(5000), cl::Hidden, + cl::desc("The number of MemoryDefs we consider as candidates to eliminated " + "other stores per basic block (default = 5000)")); + //===----------------------------------------------------------------------===// // Helper functions //===----------------------------------------------------------------------===// @@ -1354,22 +1367,479 @@ return MadeChange; } +namespace { +//============================================================================= +// MemorySSA backed dead store elimination. +// +// The code below implements dead store elimination using MemorySSA. It uses +// the following general approach: given a MemoryDef, walk upwards to find +// clobbering MemoryDefs that may be killed by the starting def. Then check +// that there are no uses that may read the location of the original MemoryDef +// in between both MemoryDefs. A bit more concretely: +// +// For all MemoryDefs StartDef: +// 1. Get the next dominating clobbering MemoryDef (DomAccess) by walking +// upwards. +// 2. Check that there are no reads between DomAccess and the StartDef by checking +// all uses starting at DomAccess and walking until we see StartDef. +// 3. For each found DomDef, check that: +// 1. There are no barrier instructions between DomDef and StartDef (like +// throws or stores with ordering constraints). +// 2. StartDef is executed whenever DomDef is executed. +// 3. StartDef completely overwrites DomDef. +// 4. Erase DomDef from the function and MemorySSA. + +// Returns true if \p M is an intrinsic that does not read or write memory.
+bool isNoopIntrinsic(MemoryUseOrDef *M) { + if (const IntrinsicInst *II = dyn_cast(M->getMemoryInst())) { + switch (II->getIntrinsicID()) { + case Intrinsic::lifetime_start: + case Intrinsic::lifetime_end: + case Intrinsic::invariant_end: + case Intrinsic::launder_invariant_group: + case Intrinsic::assume: + return true; + case Intrinsic::dbg_addr: + case Intrinsic::dbg_declare: + case Intrinsic::dbg_label: + case Intrinsic::dbg_value: + llvm_unreachable("Intrinsic should not be modeled in MemorySSA"); + default: + return false; + } + } + return false; +} + +// Check if we can ignore \p D for DSE. +bool canSkipDef(MemoryDef *D, bool DefVisibleToCaller) { + Instruction *DI = D->getMemoryInst(); + // Calls that only access inaccessible memory cannot read or write any memory + // locations we consider for elimination. + if (auto CS = CallSite(DI)) + if (CS.onlyAccessesInaccessibleMemory()) + return true; + + // We can eliminate stores to locations not visible to the caller across + // throwing instructions. + if (DI->mayThrow() && !DefVisibleToCaller) + return true; + + // We can remove the dead stores, irrespective of the fence and its ordering + // (release/acquire/seq_cst). Fences only constraints the ordering of + // already visible stores, it does not make a store visible to other + // threads. So, skipping over a fence does not change a store from being + // dead. + if (isa(DI)) + return true; + + // Skip intrinsics that do not really read or modify memory. + if (isNoopIntrinsic(D)) + return true; + + return false; +} + +struct DSEState { + Function &F; + AliasAnalysis &AA; + MemorySSA &MSSA; + DominatorTree &DT; + PostDominatorTree &PDT; + const TargetLibraryInfo &TLI; + + // All MemoryDefs that potentially could kill other MemDefs. + SmallVector MemDefs; + // Any that should be skipped as they are already deleted + SmallPtrSet SkipStores; + // Keep track of all of the objects that are invisible to the caller until the + // function returns. 
+ SmallPtrSet InvisibleToCaller; + // Keep track of blocks with throwing instructions not modeled in MemorySSA. + SmallPtrSet ThrowingBlocks; + + DSEState(Function &F, AliasAnalysis &AA, MemorySSA &MSSA, DominatorTree &DT, + PostDominatorTree &PDT, const TargetLibraryInfo &TLI) + : F(F), AA(AA), MSSA(MSSA), DT(DT), PDT(PDT), TLI(TLI) {} + + static DSEState get(Function &F, AliasAnalysis &AA, MemorySSA &MSSA, + DominatorTree &DT, PostDominatorTree &PDT, + const TargetLibraryInfo &TLI) { + DSEState State(F, AA, MSSA, DT, PDT, TLI); + // Collect blocks with throwing instructions not modeled in MemorySSA and + // alloc-like objects. + for (Instruction &I : instructions(F)) { + if (I.mayThrow() && !MSSA.getMemoryAccess(&I)) + State.ThrowingBlocks.insert(I.getParent()); + + auto *MD = dyn_cast_or_null(MSSA.getMemoryAccess(&I)); + if (MD && State.MemDefs.size() < MemorySSADefsPerBlockLimit && + hasAnalyzableMemoryWrite(&I, TLI) && isRemovable(&I)) + State.MemDefs.push_back(MD); + + // Track alloca and alloca-like objects. Here we care about objects not + // visible to the caller during function execution. Alloca objects are + // invalid in the caller, for alloca-like objects we ensure that they are + // not captured throughout the function. + if (isa(&I) || + (isAllocLikeFn(&I, &TLI) && !PointerMayBeCaptured(&I, false, true))) + State.InvisibleToCaller.insert(&I); + } + // Treat byval or inalloca arguments the same as Allocas, stores to them are + // dead at the end of the function. 
+ for (Argument &AI : F.args()) + if (AI.hasByValOrInAllocaAttr()) + State.InvisibleToCaller.insert(&AI); + return State; + } + + Optional getLocForWriteEx(Instruction *I) const { + if (!I->mayWriteToMemory()) + return None; + + if (auto *MTI = dyn_cast(I)) + return {MemoryLocation::getForDest(MTI)}; + + if (auto CS = CallSite(I)) { + if (Function *F = CS.getCalledFunction()) { + StringRef FnName = F->getName(); + if (TLI.has(LibFunc_strcpy) && FnName == TLI.getName(LibFunc_strcpy)) + return {MemoryLocation(CS.getArgument(0))}; + if (TLI.has(LibFunc_strncpy) && FnName == TLI.getName(LibFunc_strncpy)) + return {MemoryLocation(CS.getArgument(0))}; + if (TLI.has(LibFunc_strcat) && FnName == TLI.getName(LibFunc_strcat)) + return {MemoryLocation(CS.getArgument(0))}; + if (TLI.has(LibFunc_strncat) && FnName == TLI.getName(LibFunc_strncat)) + return {MemoryLocation(CS.getArgument(0))}; + } + return None; + } + + return MemoryLocation::getOrNone(I); + } + + /// Returns true if \p Use completely overwrites \p DefLoc. + bool isCompleteOverwrite(MemoryLocation DefLoc, Instruction *UseInst) const { + if (!UseInst->mayWriteToMemory()) + return false; + + if (auto CS = CallSite(UseInst)) + if (CS.onlyAccessesInaccessibleMemory()) + return false; + + ModRefInfo MR = AA.getModRefInfo(UseInst, DefLoc); + // If necessary, perform additional analysis. + if (isModSet(MR)) + MR = AA.callCapturesBefore(UseInst, DefLoc, &DT); + + Optional UseLoc = getLocForWriteEx(UseInst); + return isModSet(MR) && isMustSet(MR) && + UseLoc->Size.getValue() >= DefLoc.Size.getValue(); + } + + /// Returns true if \p Use may read from \p DefLoc. + bool isReadClobber(MemoryLocation DefLoc, Instruction *UseInst) const { + if (!UseInst->mayReadFromMemory()) + return false; + + if (auto CS = CallSite(UseInst)) + if (CS.onlyAccessesInaccessibleMemory()) + return false; + + ModRefInfo MR = AA.getModRefInfo(UseInst, DefLoc); + // If necessary, perform additional analysis. 
+ if (isRefSet(MR)) + MR = AA.callCapturesBefore(UseInst, DefLoc, &DT); + return isRefSet(MR); + } + + // Find a MemoryDef writing to \p DefLoc and dominating \p Current, with no + // read access in between or return None otherwise. The returned value may not + // (completely) overwrite \p DefLoc. Currently we bail out when we encounter + // any of the following + // * An aliasing MemoryUse (read). + // * A MemoryPHI. + Optional getDomMemoryDef(MemoryDef *KillingDef, + MemoryAccess *Current, + MemoryLocation DefLoc, + bool DefVisibleToCaller, + int &ScanLimit) const { + MemoryDef *DomDef; + MemoryAccess *StartDef = Current; + bool StepAgain; + LLVM_DEBUG(dbgs() << " trying to get dominating access for " << *Current + << "\n"); + // Find the next clobbering Mod access for DefLoc, starting at Current. + do { + StepAgain = false; + // Reached TOP. + if (MSSA.isLiveOnEntryDef(Current)) + return None; + + MemoryUseOrDef *CurrentUD = dyn_cast(Current); + if (!CurrentUD) + return None; + + // Look for access that clobber DefLoc. + MemoryAccess *DomAccess = + MSSA.getSkipSelfWalker()->getClobberingMemoryAccess( + CurrentUD->getDefiningAccess(), DefLoc); + DomDef = dyn_cast(DomAccess); + if (!DomDef || MSSA.isLiveOnEntryDef(DomDef)) + return None; + + // Check if we can skip DomDef for DSE. We also require the KillingDef + // execute whenever DomDef executes and use post-dominance to ensure that. + if (canSkipDef(DomDef, DefVisibleToCaller) || + !PDT.dominates(KillingDef->getBlock(), DomDef->getBlock())) { + StepAgain = true; + Current = DomDef; + } + + } while (StepAgain); + + LLVM_DEBUG(dbgs() << " Checking for reads of " << *DomDef << " (" + << *DomDef->getMemoryInst() << ")\n"); + + SmallSetVector WorkList; + auto PushMemUses = [&WorkList](MemoryAccess *Acc) { + for (Use &U : Acc->uses()) + WorkList.insert(cast(U.getUser())); + }; + PushMemUses(DomDef); + + // Check if DomDef may be read. 
+ for (unsigned I = 0; I < WorkList.size(); I++) { + MemoryAccess *UseAccess = WorkList[I]; + + LLVM_DEBUG(dbgs() << " Checking use " << *UseAccess); + if (--ScanLimit == 0) { + LLVM_DEBUG(dbgs() << " ... hit scan limit\n"); + return None; + } + + // Bail out on MemoryPhis for now. + if (isa(UseAccess)) { + LLVM_DEBUG(dbgs() << " ... hit MemoryPhi\n"); + return None; + } + + Instruction *UseInst = cast(UseAccess)->getMemoryInst(); + LLVM_DEBUG(dbgs() << " (" << *UseInst << ")\n"); + + if (isNoopIntrinsic(cast(UseAccess))) { + PushMemUses(UseAccess); + continue; + } + + // Uses which may read the original MemoryDef mean we cannot eliminate the + // original MD. Stop walk. + if (isReadClobber(DefLoc, UseInst)) { + LLVM_DEBUG(dbgs() << " ... found read clobber\n"); + return None; + } + + if (StartDef == UseAccess) + continue; + + // Check all uses for MemoryDefs, except for defs completely overwriting + // the original location. Otherwise we have to check uses of *all* + // MemoryDefs we discover, including non-aliasing ones. Otherwise we might + // miss cases like the following + // 1 = Def(LoE) ; <----- DomDef stores [0,1] + // 2 = Def(1) ; (2, 1) = NoAlias, stores [2,3] + // Use(2) ; MayAlias 2 *and* 1, loads [0, 3]. + // (The Use points to the *first* Def it may alias) + // 3 = Def(1) ; <---- Current (3, 2) = NoAlias, (3,1) = MayAlias, + // stores [0,1] + if (MemoryDef *UseDef = dyn_cast(UseAccess)) { + if (!isCompleteOverwrite(DefLoc, UseInst)) + PushMemUses(UseDef); + } + } + + // No aliasing MemoryUses of DomDef found, DomDef is potentially dead. + return {DomDef}; + } + + // Delete dead memory defs + void deleteDeadInstruction(Instruction *SI) { + MemorySSAUpdater Updater(&MSSA); + SmallVector NowDeadInsts; + NowDeadInsts.push_back(SI); + --NumFastOther; + + while (!NowDeadInsts.empty()) { + Instruction *DeadInst = NowDeadInsts.pop_back_val(); + ++NumFastOther; + + // Remove the Instruction from MSSA. 
+ if (MemoryAccess *MA = MSSA.getMemoryAccess(DeadInst)) { + Updater.removeMemoryAccess(MA); + if (MemoryDef *MD = dyn_cast(MA)) { + SkipStores.insert(MD); + } + } + + // Remove its operands + for (Use &O : DeadInst->operands()) + if (Instruction *OpI = dyn_cast(O)) { + O = nullptr; + if (isInstructionTriviallyDead(OpI, &TLI)) + NowDeadInsts.push_back(OpI); + } + + DeadInst->eraseFromParent(); + } + } + + // Check for any extra throws between SI and NI that block DSE. This only + // checks extra maythrows (those that aren't MemoryDef's). MemoryDef that may + // throw are handled during the walk from one def to the next. + bool mayThrowBetween(Instruction *SI, Instruction *NI, + const Value *SILocUnd) const { + // First see if we can ignore it by using the fact that SI is an + // alloca/alloca like object that is not visible to the caller during + // execution of the function. + if (SILocUnd && InvisibleToCaller.count(SILocUnd)) + return false; + + if (SI->getParent() == NI->getParent()) + return ThrowingBlocks.find(SI->getParent()) != ThrowingBlocks.end(); + return !ThrowingBlocks.empty(); + } + + // Check if \p NI acts as a DSE barrier for \p SI. The following instructions + // act as barriers: + // * A memory instruction that may throw and \p SI accesses a non-stack + // object. + // * Atomic stores stronger than monotonic. + bool isDSEBarrier(Instruction *SI, MemoryLocation &SILoc, + const Value *SILocUnd, Instruction *NI, + MemoryLocation &NILoc) const { + // If NI may throw it acts as a barrier, unless we are to an alloca/alloca + // like object that does not escape.
+ if (NI->mayThrow() && !InvisibleToCaller.count(SILocUnd)) + return true; + + if (NI->isAtomic()) { + if (auto *NSI = dyn_cast(NI)) { + if (isStrongerThanMonotonic(NSI->getOrdering())) + return true; + } else + llvm_unreachable( + "Other instructions should be modeled/skipped in MemorySSA"); + } + + return false; + } +}; + +bool eliminateDeadStoresMemorySSA(Function &F, AliasAnalysis &AA, + MemorySSA &MSSA, DominatorTree &DT, + PostDominatorTree &PDT, + const TargetLibraryInfo &TLI) { + const DataLayout &DL = F.getParent()->getDataLayout(); + bool MadeChange = false; + + DSEState State = DSEState::get(F, AA, MSSA, DT, PDT, TLI); + // For each store: + for (unsigned I = 0; I < State.MemDefs.size(); I++) { + MemoryDef *Current = State.MemDefs[I]; + if (State.SkipStores.count(Current)) + continue; + Instruction *SI = cast(Current)->getMemoryInst(); + MemoryLocation SILoc = *State.getLocForWriteEx(SI); + assert(SILoc.Ptr && "SILoc should not be null"); + const Value *SILocUnd = GetUnderlyingObject(SILoc.Ptr, DL); + Instruction *DefObj = + const_cast(dyn_cast(SILocUnd)); + bool DefVisibleToCaller = !State.InvisibleToCaller.count(SILocUnd); + if (DefObj && ((isAllocLikeFn(DefObj, &TLI) && + !PointerMayBeCapturedBefore(DefObj, false, true, SI, &DT)))) + DefVisibleToCaller = false; + + LLVM_DEBUG(dbgs() << "Trying to eliminate MemoryDefs killed by " << *SI + << "\n"); + + int ScanLimit = MemorySSAScanLimit; + MemoryDef *StartDef = Current; + Optional Next; + // Walk MemorySSA upward to find MemoryDefs that might be killed by SI. 
+ while ((Next = State.getDomMemoryDef(StartDef, Current, SILoc, + DefVisibleToCaller, ScanLimit))) { + MemoryAccess *DomAccess = *Next; + LLVM_DEBUG(dbgs() << " Checking if we can kill " << *DomAccess << "\n"); + MemoryDef *NextDef = dyn_cast(DomAccess); + Instruction *NI = NextDef->getMemoryInst(); + LLVM_DEBUG(dbgs() << " def " << *NI << "\n"); + + if (!hasAnalyzableMemoryWrite(NI, TLI)) + break; + MemoryLocation NILoc = *State.getLocForWriteEx(NI); + // Check for anything that looks like it will be a barrier to further + // removal + if (State.isDSEBarrier(SI, SILoc, SILocUnd, NI, NILoc)) { + LLVM_DEBUG(dbgs() << " stop, barrier\n"); + break; + } + + // Before we try to remove anything, check for any extra throwing + // instructions that block us from DSEing + if (State.mayThrowBetween(SI, NI, SILocUnd)) { + LLVM_DEBUG(dbgs() << " stop, may throw!\n"); + break; + } + + // Check if NI overwrites SI. + int64_t InstWriteOffset, DepWriteOffset; + InstOverlapIntervalsTy IOL; + OverwriteResult OR = isOverwrite(SILoc, NILoc, DL, TLI, DepWriteOffset, + InstWriteOffset, NI, IOL, AA, &F); + + if (OR == OW_Complete) { + LLVM_DEBUG(dbgs() << "DSE: Remove Dead Store:\n DEAD: " << *NI + << "\n KILLER: " << *SI << '\n'); + State.deleteDeadInstruction(NI); + ++NumFastStores; + MadeChange = true; + } else + Current = NextDef; + } + } + + return MadeChange; +} +} // end anonymous namespace + //===----------------------------------------------------------------------===// // DSE Pass //===----------------------------------------------------------------------===// PreservedAnalyses DSEPass::run(Function &F, FunctionAnalysisManager &AM) { - AliasAnalysis *AA = &AM.getResult(F); - DominatorTree *DT = &AM.getResult(F); - MemoryDependenceResults *MD = &AM.getResult(F); - const TargetLibraryInfo *TLI = &AM.getResult(F); + AliasAnalysis &AA = AM.getResult(F); + const TargetLibraryInfo &TLI = AM.getResult(F); + DominatorTree &DT = AM.getResult(F); + + if (EnableMemorySSA) { + MemorySSA 
&MSSA = AM.getResult(F).getMSSA(); + PostDominatorTree &PDT = AM.getResult(F); - if (!eliminateDeadStores(F, AA, MD, DT, TLI)) - return PreservedAnalyses::all(); + if (!eliminateDeadStoresMemorySSA(F, AA, MSSA, DT, PDT, TLI)) + return PreservedAnalyses::all(); + } else { + MemoryDependenceResults &MD = AM.getResult(F); + + if (!eliminateDeadStores(F, &AA, &MD, &DT, &TLI)) + return PreservedAnalyses::all(); + } PreservedAnalyses PA; PA.preserveSet(); PA.preserve(); - PA.preserve(); + if (EnableMemorySSA) + PA.preserve(); + else + PA.preserve(); return PA; } @@ -1388,25 +1858,42 @@ if (skipFunction(F)) return false; - DominatorTree *DT = &getAnalysis().getDomTree(); - AliasAnalysis *AA = &getAnalysis().getAAResults(); - MemoryDependenceResults *MD = - &getAnalysis().getMemDep(); - const TargetLibraryInfo *TLI = - &getAnalysis().getTLI(F); + AliasAnalysis &AA = getAnalysis().getAAResults(); + DominatorTree &DT = getAnalysis().getDomTree(); + const TargetLibraryInfo &TLI = + getAnalysis().getTLI(F); - return eliminateDeadStores(F, AA, MD, DT, TLI); + if (EnableMemorySSA) { + MemorySSA &MSSA = getAnalysis().getMSSA(); + PostDominatorTree &PDT = + getAnalysis().getPostDomTree(); + + return eliminateDeadStoresMemorySSA(F, AA, MSSA, DT, PDT, TLI); + } else { + MemoryDependenceResults &MD = + getAnalysis().getMemDep(); + + return eliminateDeadStores(F, &AA, &MD, &DT, &TLI); + } } void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); - AU.addRequired(); AU.addRequired(); - AU.addRequired(); AU.addRequired(); - AU.addPreserved(); AU.addPreserved(); - AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); + + if (EnableMemorySSA) { + AU.addRequired(); + AU.addRequired(); + AU.addPreserved(); + AU.addPreserved(); + } else { + AU.addRequired(); + AU.addPreserved(); + } } }; @@ -1417,8 +1904,10 @@ INITIALIZE_PASS_BEGIN(DSELegacyPass, "dse", "Dead Store Elimination", false, false) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 
+INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) +INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass) INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_PASS_END(DSELegacyPass, "dse", "Dead Store Elimination", false, diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/fence.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/fence-todo.ll copy from llvm/test/Transforms/DeadStoreElimination/MSSA/fence.ll copy to llvm/test/Transforms/DeadStoreElimination/MSSA/fence-todo.ll --- a/llvm/test/Transforms/DeadStoreElimination/MSSA/fence.ll +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/fence-todo.ll @@ -1,51 +1,6 @@ -; RUN: opt -S -basicaa -dse -enable-dse-memoryssa < %s | FileCheck %s - -; We conservative choose to prevent dead store elimination -; across release or stronger fences. It's not required -; (since the must still be a race on %addd.i), but -; it is conservatively correct. A legal optimization -; could hoist the second store above the fence, and then -; DSE one of them. -define void @test1(i32* %addr.i) { -; CHECK-LABEL: @test1 -; CHECK: store i32 5 -; CHECK: fence -; CHECK: store i32 5 -; CHECK: ret - store i32 5, i32* %addr.i, align 4 - fence release - store i32 5, i32* %addr.i, align 4 - ret void -} - -; Same as previous, but with different values. If we ever optimize -; this more aggressively, this allows us to check that the correct -; store is retained (the 'i32 1' store in this case) -define void @test1b(i32* %addr.i) { -; CHECK-LABEL: @test1b -; CHECK: store i32 42 -; CHECK: fence release -; CHECK: store i32 1 -; CHECK: ret - store i32 42, i32* %addr.i, align 4 - fence release - store i32 1, i32* %addr.i, align 4 - ret void -} +; XFAIL: * -; We *could* DSE across this fence, but don't. No other thread can -; observe the order of the acquire fence and the store. 
-define void @test2(i32* %addr.i) { -; CHECK-LABEL: @test2 -; CHECK: store -; CHECK: fence -; CHECK: store -; CHECK: ret - store i32 5, i32* %addr.i, align 4 - fence acquire - store i32 5, i32* %addr.i, align 4 - ret void -} +; RUN: opt -S -basicaa -dse -enable-dse-memoryssa < %s | FileCheck %s ; We DSE stack alloc'ed and byval locations, in the presence of fences. ; Fence does not make an otherwise thread local store visible. @@ -93,4 +48,3 @@ store i32 4, i32* %P1, align 4 ret void } - diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/fence.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/fence.ll --- a/llvm/test/Transforms/DeadStoreElimination/MSSA/fence.ll +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/fence.ll @@ -46,51 +46,3 @@ store i32 5, i32* %addr.i, align 4 ret void } - -; We DSE stack alloc'ed and byval locations, in the presence of fences. -; Fence does not make an otherwise thread local store visible. -; Right now the DSE in presence of fence is only done in end blocks (with no successors), -; but the same logic applies to other basic blocks as well. -; The store to %addr.i can be removed since it is a byval attribute -define void @test3(i32* byval %addr.i) { -; CHECK-LABEL: @test3 -; CHECK-NOT: store -; CHECK: fence -; CHECK: ret - store i32 5, i32* %addr.i, align 4 - fence release - ret void -} - -declare void @foo(i8* nocapture %p) - -declare noalias i8* @malloc(i32) - -; DSE of stores in locations allocated through library calls. -define void @test_nocapture() { -; CHECK-LABEL: @test_nocapture -; CHECK: malloc -; CHECK: foo -; CHECK-NOT: store -; CHECK: fence - %m = call i8* @malloc(i32 24) - call void @foo(i8* %m) - store i8 4, i8* %m - fence release - ret void -} - - -; This is a full fence, but it does not make a thread local store visible. -; We can DSE the store in presence of the fence. 
-define void @fence_seq_cst() { -; CHECK-LABEL: @fence_seq_cst -; CHECK-NEXT: fence seq_cst -; CHECK-NEXT: ret void - %P1 = alloca i32 - store i32 0, i32* %P1, align 4 - fence seq_cst - store i32 4, i32* %P1, align 4 - ret void -} - diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/memoryssa-scan-limit.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/memoryssa-scan-limit.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/memoryssa-scan-limit.ll @@ -0,0 +1,72 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -basicaa -dse -enable-dse-memoryssa -S | FileCheck --check-prefix=NO-LIMIT %s +; RUN: opt < %s -basicaa -dse -enable-dse-memoryssa -dse-memoryssa-scanlimit=0 -S | FileCheck --check-prefix=LIMIT-0 %s +; RUN: opt < %s -basicaa -dse -enable-dse-memoryssa -dse-memoryssa-scanlimit=3 -S | FileCheck --check-prefix=LIMIT-3 %s +; RUN: opt < %s -basicaa -dse -enable-dse-memoryssa -dse-memoryssa-scanlimit=4 -S | FileCheck --check-prefix=LIMIT-4 %s + +target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" + + +define void @test2(i32* noalias %P, i32* noalias %Q, i32* noalias %R) { +; +; NO-LIMIT-LABEL: @test2( +; NO-LIMIT-NEXT: br i1 true, label [[BB1:%.*]], label [[BB2:%.*]] +; NO-LIMIT: bb1: +; NO-LIMIT-NEXT: br label [[BB3:%.*]] +; NO-LIMIT: bb2: +; NO-LIMIT-NEXT: br label [[BB3]] +; NO-LIMIT: bb3: +; NO-LIMIT-NEXT: store i32 0, i32* [[Q:%.*]] +; NO-LIMIT-NEXT: store i32 0, i32* [[R:%.*]] +; NO-LIMIT-NEXT: store i32 0, i32* [[P:%.*]] +; NO-LIMIT-NEXT: ret void +; +; LIMIT-0-LABEL: @test2( +; LIMIT-0-NEXT: br i1 true, label [[BB1:%.*]], label [[BB2:%.*]] +; LIMIT-0: bb1: +; LIMIT-0-NEXT: br label [[BB3:%.*]] +; LIMIT-0: bb2: +; LIMIT-0-NEXT: br label [[BB3]] +; LIMIT-0: bb3: +; LIMIT-0-NEXT: store i32 0, i32* [[Q:%.*]] +; LIMIT-0-NEXT: store i32 0, i32* [[R:%.*]] +; LIMIT-0-NEXT: store i32 0, i32* [[P:%.*]] +; LIMIT-0-NEXT: ret void +; +; LIMIT-3-LABEL: @test2( 
+; LIMIT-3-NEXT: store i32 1, i32* [[P:%.*]] +; LIMIT-3-NEXT: br i1 true, label [[BB1:%.*]], label [[BB2:%.*]] +; LIMIT-3: bb1: +; LIMIT-3-NEXT: br label [[BB3:%.*]] +; LIMIT-3: bb2: +; LIMIT-3-NEXT: br label [[BB3]] +; LIMIT-3: bb3: +; LIMIT-3-NEXT: store i32 0, i32* [[Q:%.*]] +; LIMIT-3-NEXT: store i32 0, i32* [[R:%.*]] +; LIMIT-3-NEXT: store i32 0, i32* [[P]] +; LIMIT-3-NEXT: ret void +; +; LIMIT-4-LABEL: @test2( +; LIMIT-4-NEXT: br i1 true, label [[BB1:%.*]], label [[BB2:%.*]] +; LIMIT-4: bb1: +; LIMIT-4-NEXT: br label [[BB3:%.*]] +; LIMIT-4: bb2: +; LIMIT-4-NEXT: br label [[BB3]] +; LIMIT-4: bb3: +; LIMIT-4-NEXT: store i32 0, i32* [[Q:%.*]] +; LIMIT-4-NEXT: store i32 0, i32* [[R:%.*]] +; LIMIT-4-NEXT: store i32 0, i32* [[P:%.*]] +; LIMIT-4-NEXT: ret void +; + store i32 1, i32* %P + br i1 true, label %bb1, label %bb2 +bb1: + br label %bb3 +bb2: + br label %bb3 +bb3: + store i32 0, i32* %Q + store i32 0, i32* %R + store i32 0, i32* %P + ret void +} diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/memset-and-memcpy.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/memset-and-memcpy.ll --- a/llvm/test/Transforms/DeadStoreElimination/MSSA/memset-and-memcpy.ll +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/memset-and-memcpy.ll @@ -59,8 +59,7 @@ ; Should not delete the volatile memset. define void @test17v(i8* %P, i8* %Q) nounwind ssp { ; CHECK-LABEL: @test17v( -; CHECK-NEXT: tail call void @llvm.memset.p0i8.i64(i8* [[P:%.*]], i8 42, i64 8, i1 true) -; CHECK-NEXT: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[P]], i8* [[Q:%.*]], i64 12, i1 false) +; CHECK-NEXT: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[P:%.*]], i8* [[Q:%.*]], i64 12, i1 false) ; CHECK-NEXT: ret void ; tail call void @llvm.memset.p0i8.i64(i8* %P, i8 42, i64 8, i1 true) @@ -74,8 +73,7 @@ ; Previously this was not allowed (PR8728), also discussed in PR11763. 
define void @test18(i8* %P, i8* %Q, i8* %R) nounwind ssp { ; CHECK-LABEL: @test18( -; CHECK-NEXT: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[P:%.*]], i8* [[Q:%.*]], i64 12, i1 false) -; CHECK-NEXT: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[P]], i8* [[R:%.*]], i64 12, i1 false) +; CHECK-NEXT: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[P:%.*]], i8* [[R:%.*]], i64 12, i1 false) ; CHECK-NEXT: ret void ; tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i1 false) @@ -85,8 +83,7 @@ define void @test18_atomic(i8* %P, i8* %Q, i8* %R) nounwind ssp { ; CHECK-LABEL: @test18_atomic( -; CHECK-NEXT: tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 [[P:%.*]], i8* align 1 [[Q:%.*]], i64 12, i32 1) -; CHECK-NEXT: tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 [[P]], i8* align 1 [[R:%.*]], i64 12, i32 1) +; CHECK-NEXT: tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 [[P:%.*]], i8* align 1 [[R:%.*]], i64 12, i32 1) ; CHECK-NEXT: ret void ; tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %P, i8* align 1 %Q, i64 12, i32 1) diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-captures.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-captures.ll --- a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-captures.ll +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-captures.ll @@ -25,7 +25,6 @@ define i8* @test_return_captures_2() { ; CHECK-LABEL: @test_return_captures_2( ; CHECK-NEXT: [[M:%.*]] = call i8* @malloc(i64 24) -; CHECK-NEXT: store i8 0, i8* [[M]] ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: exit: ; CHECK-NEXT: store i8 1, i8* [[M]] @@ -91,7 +90,6 @@ define i8* @test_malloc_capture_3() { ; CHECK-LABEL: @test_malloc_capture_3( ; CHECK-NEXT: [[M:%.*]] = call i8* @malloc(i64 24) -; CHECK-NEXT: store i8 0, i8* [[M]] ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: exit: ; 
CHECK-NEXT: store i8 1, i8* [[M]] @@ -191,7 +189,6 @@ define i8* @test_malloc_capture_7() { ; CHECK-LABEL: @test_malloc_capture_7( ; CHECK-NEXT: [[M:%.*]] = call i8* @malloc(i64 24) -; CHECK-NEXT: store i8 0, i8* [[M]] ; CHECK-NEXT: call void @may_throw() ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: exit: @@ -215,10 +212,10 @@ define void @test_alloca_nocapture_1() { ; CHECK-LABEL: @test_alloca_nocapture_1( ; CHECK-NEXT: [[M:%.*]] = alloca i8 -; CHECK-NEXT: store i8 0, i8* [[M]] ; CHECK-NEXT: call void @foo() ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: exit: +; CHECK-NEXT: store i8 1, i8* [[M]] ; CHECK-NEXT: ret void ; %m = alloca i8 @@ -240,6 +237,7 @@ ; CHECK-NEXT: call void @capture(i8* [[M]]) ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: exit: +; CHECK-NEXT: store i8 1, i8* [[M]] ; CHECK-NEXT: ret void ; %m = alloca i8 @@ -262,6 +260,7 @@ ; CHECK: exit: ; CHECK-NEXT: [[F_PTR:%.*]] = getelementptr [[S1:%.*]], %S1* [[E:%.*]], i32 0, i32 0 ; CHECK-NEXT: store i8* [[M]], i8** [[F_PTR]] +; CHECK-NEXT: store i8 1, i8* [[M]] ; CHECK-NEXT: ret void ; %m = alloca i8 diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-exceptions.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-exceptions.ll --- a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-exceptions.ll +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-exceptions.ll @@ -30,6 +30,7 @@ ; CHECK-NEXT: [[C1:%.*]] = cleanuppad within none [] ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: +; CHECK-NEXT: store i32 40, i32* [[SV]] ; CHECK-NEXT: ret void ; block1: diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-loops.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-loops.ll --- a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-loops.ll +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-loops.ll @@ -111,7 +111,6 @@ ; CHECK: for.body4.lr.ph: ; CHECK-NEXT: [[I_028:%.*]] = phi i32 [ [[INC11:%.*]], 
[[FOR_COND_CLEANUP3:%.*]] ], [ 0, [[FOR_BODY4_LR_PH_PREHEADER]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 [[I_028]] -; CHECK-NEXT: store i32 0, i32* [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[I_028]], [[N]] ; CHECK-NEXT: br label [[FOR_BODY4:%.*]] ; CHECK: for.body4: @@ -170,3 +169,116 @@ %exitcond29 = icmp eq i32 %inc11, %N br i1 %exitcond29, label %for.cond.cleanup, label %for.body4.lr.ph } + +declare i1 @cond() readnone + +; TODO: We can eliminate the store in for.header, but we currently hit a MemoryPhi. +define void @loop_multiple_def_uses(i32* noalias %P) { +; CHECK-LABEL: @loop_multiple_def_uses( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[FOR_HEADER:%.*]] +; CHECK: for.header: +; CHECK-NEXT: store i32 1, i32* [[P:%.*]], align 4 +; CHECK-NEXT: [[C1:%.*]] = call i1 @cond() +; CHECK-NEXT: br i1 [[C1]], label [[FOR_BODY:%.*]], label [[END:%.*]] +; CHECK: for.body: +; CHECK-NEXT: store i32 1, i32* [[P]], align 4 +; CHECK-NEXT: [[LV:%.*]] = load i32, i32* [[P]] +; CHECK-NEXT: br label [[FOR_HEADER]] +; CHECK: end: +; CHECK-NEXT: store i32 3, i32* [[P]], align 4 +; CHECK-NEXT: ret void +; +entry: + br label %for.header + +for.header: + store i32 1, i32* %P, align 4 + %c1 = call i1 @cond() + br i1 %c1, label %for.body, label %end + +for.body: + store i32 1, i32* %P, align 4 + %lv = load i32, i32* %P + br label %for.header + +end: + store i32 3, i32* %P, align 4 + ret void +} + +; We cannot eliminate the store in for.header, as it is only partially +; overwritten in for.body and read afterwards. 
+define void @loop_multiple_def_uses_partial_write(i32* noalias %p) { +; CHECK-LABEL: @loop_multiple_def_uses_partial_write( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[FOR_HEADER:%.*]] +; CHECK: for.header: +; CHECK-NEXT: store i32 1239491, i32* [[P:%.*]], align 4 +; CHECK-NEXT: [[C1:%.*]] = call i1 @cond() +; CHECK-NEXT: br i1 [[C1]], label [[FOR_BODY:%.*]], label [[END:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[C:%.*]] = bitcast i32* [[P]] to i8* +; CHECK-NEXT: store i8 1, i8* [[C]], align 4 +; CHECK-NEXT: [[LV:%.*]] = load i32, i32* [[P]] +; CHECK-NEXT: br label [[FOR_HEADER]] +; CHECK: end: +; CHECK-NEXT: store i32 3, i32* [[P]], align 4 +; CHECK-NEXT: ret void +; +entry: + br label %for.header + +for.header: + store i32 1239491, i32* %p, align 4 + %c1 = call i1 @cond() + br i1 %c1, label %for.body, label %end + +for.body: + %c = bitcast i32* %p to i8* + store i8 1, i8* %c, align 4 + %lv = load i32, i32* %p + br label %for.header + +end: + store i32 3, i32* %p, align 4 + ret void +} + +; We cannot eliminate the store in for.header, as the location is not overwritten +; in for.body and read afterwards. 
+define void @loop_multiple_def_uses_mayalias_write(i32* %p, i32* %q) { + +; CHECK-LABEL: @loop_multiple_def_uses_mayalias_write( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[FOR_HEADER:%.*]] +; CHECK: for.header: +; CHECK-NEXT: store i32 1239491, i32* [[P:%.*]], align 4 +; CHECK-NEXT: [[C1:%.*]] = call i1 @cond() +; CHECK-NEXT: br i1 [[C1]], label [[FOR_BODY:%.*]], label [[END:%.*]] +; CHECK: for.body: +; CHECK-NEXT: store i32 1, i32* [[Q:%.*]], align 4 +; CHECK-NEXT: [[LV:%.*]] = load i32, i32* [[P]] +; CHECK-NEXT: br label [[FOR_HEADER]] +; CHECK: end: +; CHECK-NEXT: store i32 3, i32* [[P]], align 4 +; CHECK-NEXT: ret void +; +entry: + br label %for.header + +for.header: + store i32 1239491, i32* %p, align 4 + %c1 = call i1 @cond() + br i1 %c1, label %for.body, label %end + +for.body: + store i32 1, i32* %q, align 4 + %lv = load i32, i32* %p + br label %for.header + +end: + store i32 3, i32* %p, align 4 + ret void +} + diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-memoryphis.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-memoryphis.ll --- a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-memoryphis.ll +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-memoryphis.ll @@ -103,3 +103,73 @@ store i8 1, i8* %P2 ret void } + +declare void @hoge() + +; Check a function with a MemoryPhi with 3 incoming values. 
+define void @widget(i32* %Ptr, i1 %c1, i1 %c2, i32 %v1, i32 %v2, i32 %v3) { +; CHECK-LABEL: @widget( +; CHECK-NEXT: bb: +; CHECK-NEXT: tail call void @hoge() +; CHECK-NEXT: br i1 [[C1:%.*]], label [[BB3:%.*]], label [[BB1:%.*]] +; CHECK: bb1: +; CHECK-NEXT: br i1 [[C2:%.*]], label [[BB2:%.*]], label [[BB3]] +; CHECK: bb2: +; CHECK-NEXT: store i32 -1, i32* [[PTR:%.*]], align 4 +; CHECK-NEXT: br label [[BB3]] +; CHECK: bb3: +; CHECK-NEXT: br label [[BB4:%.*]] +; CHECK: bb4: +; CHECK-NEXT: switch i32 [[V1:%.*]], label [[BB8:%.*]] [ +; CHECK-NEXT: i32 0, label [[BB5:%.*]] +; CHECK-NEXT: i32 1, label [[BB6:%.*]] +; CHECK-NEXT: i32 2, label [[BB7:%.*]] +; CHECK-NEXT: ] +; CHECK: bb5: +; CHECK-NEXT: store i32 0, i32* [[PTR]], align 4 +; CHECK-NEXT: br label [[BB8]] +; CHECK: bb6: +; CHECK-NEXT: store i32 1, i32* [[PTR]], align 4 +; CHECK-NEXT: br label [[BB8]] +; CHECK: bb7: +; CHECK-NEXT: store i32 2, i32* [[PTR]], align 4 +; CHECK-NEXT: br label [[BB8]] +; CHECK: bb8: +; CHECK-NEXT: br label [[BB4]] +; +bb: + tail call void @hoge() + br i1 %c1, label %bb3, label %bb1 + +bb1: ; preds = %bb + br i1 %c2, label %bb2, label %bb3 + +bb2: ; preds = %bb1 + store i32 -1, i32* %Ptr, align 4 + br label %bb3 + +bb3: ; preds = %bb2, %bb1, %bb + br label %bb4 + +bb4: ; preds = %bb8, %bb3 + switch i32 %v1, label %bb8 [ + i32 0, label %bb5 + i32 1, label %bb6 + i32 2, label %bb7 + ] + +bb5: ; preds = %bb4 + store i32 0, i32* %Ptr, align 4 + br label %bb8 + +bb6: ; preds = %bb4 + store i32 1, i32* %Ptr, align 4 + br label %bb8 + +bb7: ; preds = %bb4 + store i32 2, i32* %Ptr, align 4 + br label %bb8 + +bb8: ; preds = %bb7, %bb6, %bb5, %bb4 + br label %bb4 +} diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-partial.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-partial.ll --- a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-partial.ll +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-partial.ll @@ -32,14 +32,13 @@ define void 
@second_store_bigger(i32* noalias %P) { ; CHECK-LABEL: @second_store_bigger( -; CHECK-NEXT: store i32 1, i32* [[P:%.*]] ; CHECK-NEXT: br i1 true, label [[BB1:%.*]], label [[BB2:%.*]] ; CHECK: bb1: ; CHECK-NEXT: br label [[BB3:%.*]] ; CHECK: bb2: ; CHECK-NEXT: br label [[BB3]] ; CHECK: bb3: -; CHECK-NEXT: [[P_I64:%.*]] = bitcast i32* [[P]] to i64* +; CHECK-NEXT: [[P_I64:%.*]] = bitcast i32* [[P:%.*]] to i64* ; CHECK-NEXT: store i64 0, i64* [[P_I64]] ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-simple.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-simple.ll --- a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-simple.ll +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-simple.ll @@ -6,14 +6,13 @@ define void @test2(i32* noalias %P) { ; CHECK-LABEL: @test2( -; CHECK-NEXT: store i32 1, i32* [[P:%.*]] ; CHECK-NEXT: br i1 true, label [[BB1:%.*]], label [[BB2:%.*]] ; CHECK: bb1: ; CHECK-NEXT: br label [[BB3:%.*]] ; CHECK: bb2: ; CHECK-NEXT: br label [[BB3]] ; CHECK: bb3: -; CHECK-NEXT: store i32 0, i32* [[P]] +; CHECK-NEXT: store i32 0, i32* [[P:%.*]] ; CHECK-NEXT: ret void ; store i32 1, i32* %P @@ -53,7 +52,6 @@ define void @test7(i32* noalias %P, i32* noalias %Q) { ; CHECK-LABEL: @test7( -; CHECK-NEXT: store i32 1, i32* [[Q:%.*]] ; CHECK-NEXT: br i1 true, label [[BB1:%.*]], label [[BB2:%.*]] ; CHECK: bb1: ; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[P:%.*]] @@ -61,7 +59,7 @@ ; CHECK: bb2: ; CHECK-NEXT: br label [[BB3]] ; CHECK: bb3: -; CHECK-NEXT: store i32 0, i32* [[Q]] +; CHECK-NEXT: store i32 0, i32* [[Q:%.*]] ; CHECK-NEXT: store i32 0, i32* [[P]] ; CHECK-NEXT: ret void ; @@ -114,3 +112,38 @@ store i32 0, i32* %P ret void } + +; We cannot eliminate `store i32 0, i32* %P`, as it is read by the later load. +; Make sure that we check the uses of `store i32 1, i32* %P.1 which does not +; alias %P. Note that uses point to the *first* def that may alias. 
+define void @overlapping_read(i32* %P) { +; CHECK-LABEL: @overlapping_read( +; CHECK-NEXT: store i32 0, i32* [[P:%.*]] +; CHECK-NEXT: [[P_1:%.*]] = getelementptr i32, i32* [[P]], i32 1 +; CHECK-NEXT: store i32 1, i32* [[P_1]] +; CHECK-NEXT: [[P_64:%.*]] = bitcast i32* [[P]] to i64* +; CHECK-NEXT: [[LV:%.*]] = load i64, i64* [[P_64]] +; CHECK-NEXT: br i1 true, label [[BB1:%.*]], label [[BB2:%.*]] +; CHECK: bb1: +; CHECK-NEXT: br label [[BB3:%.*]] +; CHECK: bb2: +; CHECK-NEXT: ret void +; CHECK: bb3: +; CHECK-NEXT: store i32 2, i32* [[P]] +; CHECK-NEXT: ret void +; + store i32 0, i32* %P + %P.1 = getelementptr i32, i32* %P, i32 1 + store i32 1, i32* %P.1 + + %P.64 = bitcast i32* %P to i64* + %lv = load i64, i64* %P.64 + br i1 true, label %bb1, label %bb2 +bb1: + br label %bb3 +bb2: + ret void +bb3: + store i32 2, i32* %P + ret void +} diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/simple.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/simple.ll --- a/llvm/test/Transforms/DeadStoreElimination/MSSA/simple.ll +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/simple.ll @@ -294,8 +294,7 @@ ; The memmove is dead, because memcpy arguments cannot overlap. define void @test38(i8* %P, i8* %Q, i8* %R) { ; CHECK-LABEL: @test38( -; CHECK-NEXT: tail call void @llvm.memmove.p0i8.p0i8.i64(i8* [[P:%.*]], i8* [[Q:%.*]], i64 12, i1 false) -; CHECK-NEXT: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[P]], i8* [[R:%.*]], i64 12, i1 false) +; CHECK-NEXT: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[P:%.*]], i8* [[R:%.*]], i64 12, i1 false) ; CHECK-NEXT: ret void ; @@ -307,8 +306,7 @@ ; The memmove is dead, because memcpy arguments cannot overlap. 
define void @test38_atomic(i8* %P, i8* %Q, i8* %R) { ; CHECK-LABEL: @test38_atomic( -; CHECK-NEXT: tail call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 [[P:%.*]], i8* align 1 [[Q:%.*]], i64 12, i32 1) -; CHECK-NEXT: tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 [[P]], i8* align 1 [[R:%.*]], i64 12, i32 1) +; CHECK-NEXT: tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 [[P:%.*]], i8* align 1 [[R:%.*]], i64 12, i32 1) ; CHECK-NEXT: ret void ; @@ -379,7 +377,9 @@ define void @test41(i32* noalias %P) { ; CHECK-LABEL: @test41( ; CHECK-NEXT: [[P2:%.*]] = bitcast i32* [[P:%.*]] to i8* +; CHECK-NEXT: store i32 1, i32* [[P]] ; CHECK-NEXT: call void @unknown_func() +; CHECK-NEXT: store i32 2, i32* [[P]] ; CHECK-NEXT: call void @free(i8* [[P2]]) ; CHECK-NEXT: ret void ;