diff --git a/llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h b/llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h
--- a/llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h
+++ b/llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h
@@ -43,6 +43,7 @@
   AliasAnalysis *AA = nullptr;
   AssumptionCache *AC = nullptr;
   DominatorTree *DT = nullptr;
+  MemorySSA *MSSA = nullptr;
   MemorySSAUpdater *MSSAU = nullptr;

 public:
diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
--- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -67,7 +67,6 @@
 #define DEBUG_TYPE "memcpyopt"

-// TODO: Actually implement MemorySSA-based MemCpyOpt.
 static cl::opt<bool> EnableMemorySSA("enable-memcpyopt-memoryssa",
                                      cl::init(false), cl::Hidden,
                                      cl::desc("Use MemorySSA-backed MemCpyOpt."));
@@ -283,7 +282,8 @@
     AU.addPreserved<DominatorTreeWrapperPass>();
     AU.addPreserved<GlobalsAAWrapperPass>();
     AU.addRequired<TargetLibraryInfoWrapperPass>();
-    AU.addRequired<MemoryDependenceWrapperPass>();
+    if (!EnableMemorySSA)
+      AU.addRequired<MemoryDependenceWrapperPass>();
     AU.addPreserved<MemoryDependenceWrapperPass>();
     AU.addRequired<AAResultsWrapperPass>();
     AU.addPreserved<AAResultsWrapperPass>();
@@ -330,10 +330,37 @@
 void MemCpyOptPass::eraseInstruction(Instruction *I) {
   if (MSSAU)
     MSSAU->removeMemoryAccess(I);
-  MD->removeInstruction(I);
+  if (MD)
+    MD->removeInstruction(I);
   I->eraseFromParent();
 }

+// Check for mod or ref of Loc between Start and End, excluding both
+// boundaries. Start and End must be in the same block.
+static bool accessedBetween(AliasAnalysis &AA, MemoryLocation Loc,
+                            const MemoryUseOrDef *Start,
+                            const MemoryUseOrDef *End) {
+  assert(Start->getBlock() == End->getBlock() && "Only local supported");
+  for (const MemoryAccess &MA :
+       make_range(++Start->getIterator(), End->getIterator())) {
+    if (isModOrRefSet(AA.getModRefInfo(cast<MemoryUseOrDef>(MA).getMemoryInst(),
+                                       Loc)))
+      return true;
+  }
+  return false;
+}
+
+// Check for mod of Loc between Start and End, excluding both boundaries.
+// Start and End can be in different blocks.
+static bool writtenBetween(MemorySSA *MSSA, MemoryLocation Loc,
+                           const MemoryUseOrDef *Start,
+                           const MemoryUseOrDef *End) {
+  // TODO: Only walk until we hit Start.
+  MemoryAccess *Clobber = MSSA->getWalker()->getClobberingMemoryAccess(
+      End->getDefiningAccess(), Loc);
+  return !MSSA->dominates(Clobber, Start);
+}
+
 /// When scanning forward over instructions, we look for some other patterns to
 /// fold away. In particular, this looks for stores to neighboring locations of
 /// memory. If it sees enough consecutive ones, it attempts to merge them
@@ -645,6 +672,7 @@
   // the memory we load from in between the load and the store. If
   // such an instruction is found, we try to promote there instead
   // of at the store position.
+  // TODO: Can use MSSA for this.
   Instruction *P = SI;
   for (auto &I : make_range(++LI->getIterator(), SI->getIterator())) {
     if (isModSet(AA->getModRefInfo(&I, LoadLoc))) {
@@ -709,20 +737,37 @@
   // Detect cases where we're performing call slot forwarding, but
   // happen to be using a load-store pair to implement it, rather than
   // a memcpy.
-  MemDepResult ldep = MD->getDependency(LI);
   CallInst *C = nullptr;
-  if (ldep.isClobber() && !isa<LoadInst>(ldep.getInst()))
-    C = dyn_cast<CallInst>(ldep.getInst());
+  if (EnableMemorySSA) {
+    if (auto *LoadClobber = dyn_cast<MemoryUseOrDef>(
+            MSSA->getWalker()->getClobberingMemoryAccess(LI))) {
+      // The load must post-dominate the call. Limit to the same block for now.
+      // TODO: Support non-local call-slot optimization?
+      if (LoadClobber->getBlock() == SI->getParent())
+        C = dyn_cast_or_null<CallInst>(LoadClobber->getMemoryInst());
+    }
+  } else {
+    MemDepResult ldep = MD->getDependency(LI);
+    if (ldep.isClobber() && !isa<LoadInst>(ldep.getInst()))
+      C = dyn_cast<CallInst>(ldep.getInst());
+  }

   if (C) {
     // Check that nothing touches the dest of the "copy" between
     // the call and the store.
     MemoryLocation StoreLoc = MemoryLocation::get(SI);
-    for (BasicBlock::iterator I = --SI->getIterator(), E = C->getIterator();
-         I != E; --I) {
-      if (isModOrRefSet(AA->getModRefInfo(&*I, StoreLoc))) {
+    if (EnableMemorySSA) {
+      if (accessedBetween(*AA, StoreLoc, MSSA->getMemoryAccess(C),
+                          MSSA->getMemoryAccess(SI)))
         C = nullptr;
-        break;
+    } else {
+      for (BasicBlock::iterator I = --SI->getIterator(),
+                                E = C->getIterator();
+           I != E; --I) {
+        if (isModOrRefSet(AA->getModRefInfo(&*I, StoreLoc))) {
+          C = nullptr;
+          break;
+        }
       }
     }
   }
@@ -972,7 +1017,8 @@
   // Drop any cached information about the call, because we may have changed
   // its dependence information by changing its parameter.
-  MD->removeInstruction(C);
+  if (MD)
+    MD->removeInstruction(C);

   // Update AA metadata
   // FIXME: MD_tbaa_struct and MD_mem_parallel_loop_access should also be
@@ -1020,14 +1066,21 @@
   //
   // TODO: If the code between M and MDep is transparent to the destination "c",
   // then we could still perform the xform by moving M up to the first memcpy.
-  //
-  // NOTE: This is conservative, it will stop on any read from the source loc,
-  // not just the defining memcpy.
-  MemDepResult SourceDep =
-      MD->getPointerDependencyFrom(MemoryLocation::getForSource(MDep), false,
-                                   M->getIterator(), M->getParent());
-  if (!SourceDep.isClobber() || SourceDep.getInst() != MDep)
-    return false;
+  if (EnableMemorySSA) {
+    // TODO: It would be sufficient to check the MDep source up to the memcpy
+    // size of M, rather than MDep.
+    if (writtenBetween(MSSA, MemoryLocation::getForSource(MDep),
+                       MSSA->getMemoryAccess(MDep), MSSA->getMemoryAccess(M)))
+      return false;
+  } else {
+    // NOTE: This is conservative, it will stop on any read from the source
+    // loc, not just the defining memcpy.
+    MemDepResult SourceDep =
+        MD->getPointerDependencyFrom(MemoryLocation::getForSource(MDep), false,
+                                     M->getIterator(), M->getParent());
+    if (!SourceDep.isClobber() || SourceDep.getInst() != MDep)
+      return false;
+  }

   // If the dest of the second might alias the source of the first, then the
   // source and dest might overlap. We still want to eliminate the intermediate
@@ -1095,12 +1148,24 @@
                                    LocationSize::precise(1))))
     return false;

-  // Check that there are no other dependencies on the memset destination.
-  MemDepResult DstDepInfo =
-      MD->getPointerDependencyFrom(MemoryLocation::getForDest(MemSet), false,
-                                   MemCpy->getIterator(), MemCpy->getParent());
-  if (DstDepInfo.getInst() != MemSet)
-    return false;
+  if (EnableMemorySSA) {
+    // We know that dst up to src_size is not written. We now need to make sure
+    // that dst up to dst_size is not accessed. (If we did not move the memset,
+    // checking for reads would be sufficient.)
+    if (accessedBetween(*AA, MemoryLocation::getForDest(MemSet),
+                        MSSA->getMemoryAccess(MemSet),
+                        MSSA->getMemoryAccess(MemCpy))) {
+      return false;
+    }
+  } else {
+    // We have already checked that dst up to src_size is not accessed. We
+    // need to make sure that there are no accesses up to dst_size either.
+    MemDepResult DstDepInfo = MD->getPointerDependencyFrom(
+        MemoryLocation::getForDest(MemSet), false, MemCpy->getIterator(),
+        MemCpy->getParent());
+    if (DstDepInfo.getInst() != MemSet)
+      return false;
+  }

   // Use the same i8* dest as the memcpy, killing the memset dest if different.
   Value *Dest = MemCpy->getRawDest();
@@ -1172,6 +1237,24 @@
   return false;
 }

+static bool hasUndefContentsMSSA(MemorySSA *MSSA, AliasAnalysis *AA, Value *V,
+                                 MemoryDef *Def, ConstantInt *Size) {
+  if (MSSA->isLiveOnEntryDef(Def))
+    return isa<AllocaInst>(getUnderlyingObject(V));
+
+  if (IntrinsicInst *II =
+          dyn_cast_or_null<IntrinsicInst>(Def->getMemoryInst())) {
+    if (II->getIntrinsicID() == Intrinsic::lifetime_start) {
+      ConstantInt *LTSize = cast<ConstantInt>(II->getArgOperand(0));
+      if (AA->isMustAlias(V, II->getArgOperand(1)) &&
+          LTSize->getZExtValue() >= Size->getZExtValue())
+        return true;
+    }
+  }
+
+  return false;
+}
+
 /// Transform memcpy to memset when its source was just memset.
 /// In other words, turn:
 /// \code
@@ -1207,12 +1290,24 @@
     // interested in the bytes from MemSetSize..CopySize here, but as we can't
     // easily represent this location, we use the full 0..CopySize range.
     MemoryLocation MemCpyLoc = MemoryLocation::getForSource(MemCpy);
-    MemDepResult DepInfo = MD->getPointerDependencyFrom(
-        MemCpyLoc, true, MemSet->getIterator(), MemSet->getParent());
-    if (DepInfo.isDef() && hasUndefContents(DepInfo.getInst(), CopySize))
-      CopySize = MemSetSize;
-    else
+    bool CanReduceSize = false;
+    if (EnableMemorySSA) {
+      MemoryUseOrDef *MemSetAccess = MSSA->getMemoryAccess(MemSet);
+      MemoryAccess *Clobber = MSSA->getWalker()->getClobberingMemoryAccess(
+          MemSetAccess->getDefiningAccess(), MemCpyLoc);
+      if (auto *MD = dyn_cast<MemoryDef>(Clobber))
+        if (hasUndefContentsMSSA(MSSA, AA, MemCpy->getSource(), MD, CopySize))
+          CanReduceSize = true;
+    } else {
+      MemDepResult DepInfo = MD->getPointerDependencyFrom(
+          MemCpyLoc, true, MemSet->getIterator(), MemSet->getParent());
+      if (DepInfo.isDef() && hasUndefContents(DepInfo.getInst(), CopySize))
+        CanReduceSize = true;
+    }
+
+    if (!CanReduceSize)
       return false;
+    CopySize = MemSetSize;
   }

   IRBuilder<> Builder(MemCpy);
@@ -1267,63 +1362,140 @@
     return true;
   }

-  MemDepResult DepInfo = MD->getDependency(M);
-
-  // Try to turn a partially redundant memset + memcpy into
-  // memcpy + smaller memset. We don't need the memcpy size for this.
-  if (DepInfo.isClobber())
-    if (MemSetInst *MDep = dyn_cast<MemSetInst>(DepInfo.getInst()))
-      if (processMemSetMemCpyDependence(M, MDep))
-        return true;
-
-  // The optimizations after this point require the memcpy size.
-  ConstantInt *CopySize = dyn_cast<ConstantInt>(M->getLength());
-  if (!CopySize) return false;
-
-  // There are four possible optimizations we can do for memcpy:
-  //   a) memcpy-memcpy xform which exposes redundancy for DSE.
-  //   b) call-memcpy xform for return slot optimization.
-  //   c) memcpy from freshly alloca'd space or space that has just started
-  //      its lifetime copies undefined data, and we can therefore eliminate
-  //      the memcpy in favor of the data that was already at the destination.
-  //   d) memcpy from a just-memset'd source can be turned into memset.
-  if (DepInfo.isClobber()) {
-    if (CallInst *C = dyn_cast<CallInst>(DepInfo.getInst())) {
-      // FIXME: Can we pass in either of dest/src alignment here instead
-      // of conservatively taking the minimum?
-      Align Alignment = std::min(M->getDestAlign().valueOrOne(),
-                                 M->getSourceAlign().valueOrOne());
-      if (performCallSlotOptzn(M, M, M->getDest(), M->getSource(),
-                               CopySize->getZExtValue(), Alignment, C)) {
-        eraseInstruction(M);
-        ++NumMemCpyInstr;
-        return true;
+  if (EnableMemorySSA) {
+    MemoryUseOrDef *MA = MSSA->getMemoryAccess(M);
+    MemoryAccess *AnyClobber = MSSA->getWalker()->getClobberingMemoryAccess(MA);
+    MemoryLocation DestLoc = MemoryLocation::getForDest(M);
+    const MemoryAccess *DestClobber =
+        MSSA->getWalker()->getClobberingMemoryAccess(AnyClobber, DestLoc);
+
+    // Try to turn a partially redundant memset + memcpy into
+    // memcpy + smaller memset. We don't need the memcpy size for this.
+    // The memcpy must post-dominate the memset, so limit this to the same
+    // basic block. A non-local generalization is likely not worthwhile.
+    if (auto *MD = dyn_cast<MemoryDef>(DestClobber))
+      if (auto *MDep = dyn_cast_or_null<MemSetInst>(MD->getMemoryInst()))
+        if (DestClobber->getBlock() == M->getParent())
+          if (processMemSetMemCpyDependence(M, MDep))
+            return true;
+
+    // The optimizations after this point require the memcpy size.
+    ConstantInt *CopySize = dyn_cast<ConstantInt>(M->getLength());
+    if (!CopySize) return false;
+
+    MemoryAccess *SrcClobber = MSSA->getWalker()->getClobberingMemoryAccess(
+        AnyClobber, MemoryLocation::getForSource(M));
+
+    // There are four possible optimizations we can do for memcpy:
+    //   a) memcpy-memcpy xform which exposes redundancy for DSE.
+    //   b) call-memcpy xform for return slot optimization.
+    //   c) memcpy from freshly alloca'd space or space that has just started
+    //      its lifetime copies undefined data, and we can therefore eliminate
+    //      the memcpy in favor of the data that was already at the
+    //      destination.
+    //   d) memcpy from a just-memset'd source can be turned into memset.
+    if (auto *MD = dyn_cast<MemoryDef>(SrcClobber)) {
+      if (Instruction *MI = MD->getMemoryInst()) {
+        if (auto *C = dyn_cast<CallInst>(MI)) {
+          // The memcpy must post-dominate the call. Limit to the same block
+          // for now. Additionally, we need to ensure that there are no
+          // accesses to dest between the call and the memcpy. Accesses to src
+          // will be checked by performCallSlotOptzn().
+          // TODO: Support non-local call-slot optimization?
+          if (C->getParent() == M->getParent() &&
+              !accessedBetween(*AA, DestLoc, MD, MA)) {
+            // FIXME: Can we pass in either of dest/src alignment here instead
+            // of conservatively taking the minimum?
+            Align Alignment = std::min(M->getDestAlign().valueOrOne(),
+                                       M->getSourceAlign().valueOrOne());
+            if (performCallSlotOptzn(M, M, M->getDest(), M->getSource(),
+                                     CopySize->getZExtValue(), Alignment, C)) {
+              LLVM_DEBUG(dbgs() << "Performed call slot optimization:\n"
+                                << "    call: " << *C << "\n"
+                                << "    memcpy: " << *M << "\n");
+              eraseInstruction(M);
+              ++NumMemCpyInstr;
+              return true;
+            }
+          }
+        }
+        if (auto *MDep = dyn_cast<MemCpyInst>(MI))
+          return processMemCpyMemCpyDependence(M, MDep);
+        if (auto *MDep = dyn_cast<MemSetInst>(MI)) {
+          if (performMemCpyToMemSetOptzn(M, MDep)) {
+            LLVM_DEBUG(dbgs() << "Converted memcpy to memset\n");
+            eraseInstruction(M);
+            ++NumCpyToSet;
+            return true;
+          }
+        }
+      }
+
+      if (hasUndefContentsMSSA(MSSA, AA, M->getSource(), MD, CopySize)) {
+        LLVM_DEBUG(dbgs() << "Removed memcpy from undef\n");
+        eraseInstruction(M);
+        ++NumMemCpyInstr;
+        return true;
       }
     }
-  }
+  } else {
+    MemDepResult DepInfo = MD->getDependency(M);
+
+    // Try to turn a partially redundant memset + memcpy into
+    // memcpy + smaller memset. We don't need the memcpy size for this.
+    if (DepInfo.isClobber())
+      if (MemSetInst *MDep = dyn_cast<MemSetInst>(DepInfo.getInst()))
+        if (processMemSetMemCpyDependence(M, MDep))
+          return true;

-  MemoryLocation SrcLoc = MemoryLocation::getForSource(M);
-  MemDepResult SrcDepInfo = MD->getPointerDependencyFrom(
-      SrcLoc, true, M->getIterator(), M->getParent());
-
-  if (SrcDepInfo.isClobber()) {
-    if (MemCpyInst *MDep = dyn_cast<MemCpyInst>(SrcDepInfo.getInst()))
-      return processMemCpyMemCpyDependence(M, MDep);
-  } else if (SrcDepInfo.isDef()) {
-    if (hasUndefContents(SrcDepInfo.getInst(), CopySize)) {
-      eraseInstruction(M);
-      ++NumMemCpyInstr;
-      return true;
+    // The optimizations after this point require the memcpy size.
+    ConstantInt *CopySize = dyn_cast<ConstantInt>(M->getLength());
+    if (!CopySize) return false;
+
+    // There are four possible optimizations we can do for memcpy:
+    //   a) memcpy-memcpy xform which exposes redundancy for DSE.
+    //   b) call-memcpy xform for return slot optimization.
+    //   c) memcpy from freshly alloca'd space or space that has just started
+    //      its lifetime copies undefined data, and we can therefore eliminate
+    //      the memcpy in favor of the data that was already at the
+    //      destination.
+    //   d) memcpy from a just-memset'd source can be turned into memset.
+    if (DepInfo.isClobber()) {
+      if (CallInst *C = dyn_cast<CallInst>(DepInfo.getInst())) {
+        // FIXME: Can we pass in either of dest/src alignment here instead
+        // of conservatively taking the minimum?
+        Align Alignment = std::min(M->getDestAlign().valueOrOne(),
+                                   M->getSourceAlign().valueOrOne());
+        if (performCallSlotOptzn(M, M, M->getDest(), M->getSource(),
+                                 CopySize->getZExtValue(), Alignment, C)) {
+          eraseInstruction(M);
+          ++NumMemCpyInstr;
+          return true;
+        }
+      }
     }
-  }

-  if (SrcDepInfo.isClobber())
-    if (MemSetInst *MDep = dyn_cast<MemSetInst>(SrcDepInfo.getInst()))
-      if (performMemCpyToMemSetOptzn(M, MDep)) {
+    MemoryLocation SrcLoc = MemoryLocation::getForSource(M);
+    MemDepResult SrcDepInfo = MD->getPointerDependencyFrom(
+        SrcLoc, true, M->getIterator(), M->getParent());
+
+    if (SrcDepInfo.isClobber()) {
+      if (MemCpyInst *MDep = dyn_cast<MemCpyInst>(SrcDepInfo.getInst()))
+        return processMemCpyMemCpyDependence(M, MDep);
+    } else if (SrcDepInfo.isDef()) {
+      if (hasUndefContents(SrcDepInfo.getInst(), CopySize)) {
         eraseInstruction(M);
-        ++NumCpyToSet;
+        ++NumMemCpyInstr;
         return true;
       }
+    }
+
+    if (SrcDepInfo.isClobber())
+      if (MemSetInst *MDep = dyn_cast<MemSetInst>(SrcDepInfo.getInst()))
+        if (performMemCpyToMemSetOptzn(M, MDep)) {
+          eraseInstruction(M);
+          ++NumCpyToSet;
+          return true;
+        }
+  }

   return false;
 }
@@ -1354,7 +1526,8 @@
   // MemDep may have overly conservative information about this instruction;
   // just conservatively flush it from the cache.
-  MD->removeInstruction(M);
+  if (MD)
+    MD->removeInstruction(M);

   ++NumMoveToCpy;
   return true;
@@ -1367,16 +1540,25 @@
   Value *ByValArg = CB.getArgOperand(ArgNo);
   Type *ByValTy = cast<PointerType>(ByValArg->getType())->getElementType();
   uint64_t ByValSize = DL.getTypeAllocSize(ByValTy);
-  MemDepResult DepInfo = MD->getPointerDependencyFrom(
-      MemoryLocation(ByValArg, LocationSize::precise(ByValSize)), true,
-      CB.getIterator(), CB.getParent());
-  if (!DepInfo.isClobber())
-    return false;
+  MemoryLocation Loc(ByValArg, LocationSize::precise(ByValSize));
+  MemCpyInst *MDep = nullptr;
+  if (EnableMemorySSA) {
+    MemoryUseOrDef *CallAccess = MSSA->getMemoryAccess(&CB);
+    MemoryAccess *Clobber = MSSA->getWalker()->getClobberingMemoryAccess(
+        CallAccess->getDefiningAccess(), Loc);
+    if (auto *MD = dyn_cast<MemoryDef>(Clobber))
+      MDep = dyn_cast_or_null<MemCpyInst>(MD->getMemoryInst());
+  } else {
+    MemDepResult DepInfo = MD->getPointerDependencyFrom(
+        Loc, true, CB.getIterator(), CB.getParent());
+    if (!DepInfo.isClobber())
+      return false;
+    MDep = dyn_cast<MemCpyInst>(DepInfo.getInst());
+  }

   // If the byval argument isn't fed by a memcpy, ignore it. If it is fed by
   // a memcpy, see if we can byval from the source of the memcpy instead of the
   // result.
-  MemCpyInst *MDep = dyn_cast<MemCpyInst>(DepInfo.getInst());
   if (!MDep || MDep->isVolatile() ||
       ByValArg->stripPointerCasts() != MDep->getDest())
     return false;
@@ -1410,14 +1592,19 @@
   //   *b = 42;
   //   foo(*a)
   // It would be invalid to transform the second memcpy into foo(*b).
-  //
-  // NOTE: This is conservative, it will stop on any read from the source loc,
-  // not just the defining memcpy.
-  MemDepResult SourceDep = MD->getPointerDependencyFrom(
-      MemoryLocation::getForSource(MDep), false,
-      CB.getIterator(), MDep->getParent());
-  if (!SourceDep.isClobber() || SourceDep.getInst() != MDep)
-    return false;
+  if (EnableMemorySSA) {
+    if (writtenBetween(MSSA, MemoryLocation::getForSource(MDep),
+                       MSSA->getMemoryAccess(MDep), MSSA->getMemoryAccess(&CB)))
+      return false;
+  } else {
+    // NOTE: This is conservative, it will stop on any read from the source
+    // loc, not just the defining memcpy.
+    MemDepResult SourceDep = MD->getPointerDependencyFrom(
+        MemoryLocation::getForSource(MDep), false,
+        CB.getIterator(), MDep->getParent());
+    if (!SourceDep.isClobber() || SourceDep.getInst() != MDep)
+      return false;
+  }

   Value *TmpCast = MDep->getSource();
   if (MDep->getSource()->getType() != ByValArg->getType()) {
@@ -1484,7 +1671,8 @@
 }

 PreservedAnalyses MemCpyOptPass::run(Function &F, FunctionAnalysisManager &AM) {
-  auto &MD = AM.getResult<MemoryDependenceAnalysis>(F);
+  auto *MD = !EnableMemorySSA ? &AM.getResult<MemoryDependenceAnalysis>(F)
+                              : AM.getCachedResult<MemoryDependenceAnalysis>(F);
   auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
   auto *AA = &AM.getResult<AAManager>(F);
   auto *AC = &AM.getResult<AssumptionAnalysis>(F);
@@ -1493,14 +1681,15 @@
                                  : AM.getCachedResult<MemorySSAAnalysis>(F);

   bool MadeChange =
-      runImpl(F, &MD, &TLI, AA, AC, DT, MSSA ? &MSSA->getMSSA() : nullptr);
+      runImpl(F, MD, &TLI, AA, AC, DT, MSSA ? &MSSA->getMSSA() : nullptr);
   if (!MadeChange)
     return PreservedAnalyses::all();

   PreservedAnalyses PA;
   PA.preserveSet<CFGAnalyses>();
   PA.preserve<GlobalsAA>();
-  PA.preserve<MemoryDependenceAnalysis>();
+  if (MD)
+    PA.preserve<MemoryDependenceAnalysis>();
   if (MSSA)
     PA.preserve<MemorySSAAnalysis>();
   return PA;
@@ -1516,6 +1705,7 @@
   AA = AA_;
   AC = AC_;
   DT = DT_;
+  MSSA = MSSA_;
   MemorySSAUpdater MSSAU_(MSSA_);
   MSSAU = MSSA_ ? &MSSAU_ : nullptr;

   // If we don't have at least memset and memcpy, there is little point of doing
@@ -1542,7 +1732,9 @@
   if (skipFunction(F))
     return false;

-  auto *MD = &getAnalysis<MemoryDependenceWrapperPass>().getMemDep();
+  auto *MDWP = !EnableMemorySSA
+                   ? &getAnalysis<MemoryDependenceWrapperPass>()
+                   : getAnalysisIfAvailable<MemoryDependenceWrapperPass>();
   auto *TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
   auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
   auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
@@ -1551,6 +1743,6 @@
                     ? &getAnalysis<MemorySSAWrapperPass>()
                     : getAnalysisIfAvailable<MemorySSAWrapperPass>();

-  return Impl.runImpl(F, MD, TLI, AA, AC, DT,
+  return Impl.runImpl(F, MDWP ? &MDWP->getMemDep() : nullptr, TLI, AA, AC, DT,
                       MSSAWP ? &MSSAWP->getMSSA() : nullptr);
 }
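Before the tests, a minimal IR sketch of the call slot pattern that performCallSlotOptzn() targets; the function @f, the buffer size, and all names here are illustrative, not taken from the patch:

declare void @f(i8*)
declare void @llvm.memcpy.p0i8.p0i8.i64(i8*, i8*, i64, i1)

define void @call_slot_sketch(i8* noalias %dest) {
  %tmp = alloca [16 x i8]
  %src = getelementptr inbounds [16 x i8], [16 x i8]* %tmp, i32 0, i32 0
  ; @f writes its result into the local buffer %src.
  call void @f(i8* %src)
  ; The copy forwards that result to %dest. If nothing mods or refs %dest
  ; between the two calls (the accessedBetween() check on the MemorySSA path),
  ; the call can target %dest directly and the memcpy disappears.
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dest, i8* %src, i64 16, i1 false)
  ret void
}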
diff --git a/llvm/test/Transforms/MemCpyOpt/callslot.ll b/llvm/test/Transforms/MemCpyOpt/callslot.ll
--- a/llvm/test/Transforms/MemCpyOpt/callslot.ll
+++ b/llvm/test/Transforms/MemCpyOpt/callslot.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -memcpyopt < %s -enable-memcpyopt-memoryssa=0 | FileCheck %s
-; RUN: opt -S -memcpyopt < %s -enable-memcpyopt-memoryssa=1 -verify-memoryssa | FileCheck %s
+; RUN: opt -S -memcpyopt < %s -enable-memcpyopt-memoryssa=0 | FileCheck %s --check-prefixes=CHECK,NO_MSSA
+; RUN: opt -S -memcpyopt < %s -enable-memcpyopt-memoryssa=1 -verify-memoryssa | FileCheck %s --check-prefixes=CHECK,MSSA

 define i8 @read_dest_between_call_and_memcpy() {
 ; CHECK-LABEL: @read_dest_between_call_and_memcpy(
@@ -26,15 +26,25 @@
 }

 define i8 @read_src_between_call_and_memcpy() {
-; CHECK-LABEL: @read_src_between_call_and_memcpy(
-; CHECK-NEXT: [[DEST:%.*]] = alloca [16 x i8], align 1
-; CHECK-NEXT: [[SRC:%.*]] = alloca [16 x i8], align 1
-; CHECK-NEXT: [[DEST_I8:%.*]] = bitcast [16 x i8]* [[DEST]] to i8*
-; CHECK-NEXT: [[SRC_I8:%.*]] = bitcast [16 x i8]* [[SRC]] to i8*
-; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* [[SRC_I8]], i8 0, i64 16, i1 false)
-; CHECK-NEXT: [[X:%.*]] = load i8, i8* [[SRC_I8]], align 1
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[DEST_I8]], i8* [[SRC_I8]], i64 16, i1 false)
-; CHECK-NEXT: ret i8 [[X]]
+; NO_MSSA-LABEL: @read_src_between_call_and_memcpy(
+; NO_MSSA-NEXT: [[DEST:%.*]] = alloca [16 x i8], align 1
+; NO_MSSA-NEXT: [[SRC:%.*]] = alloca [16 x i8], align 1
+; NO_MSSA-NEXT: [[DEST_I8:%.*]] = bitcast [16 x i8]* [[DEST]] to i8*
+; NO_MSSA-NEXT: [[SRC_I8:%.*]] = bitcast [16 x i8]* [[SRC]] to i8*
+; NO_MSSA-NEXT: call void @llvm.memset.p0i8.i64(i8* [[SRC_I8]], i8 0, i64 16, i1 false)
+; NO_MSSA-NEXT: [[X:%.*]] = load i8, i8* [[SRC_I8]], align 1
+; NO_MSSA-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[DEST_I8]], i8* [[SRC_I8]], i64 16, i1 false)
+; NO_MSSA-NEXT: ret i8 [[X]]
+;
+; MSSA-LABEL: @read_src_between_call_and_memcpy(
+; MSSA-NEXT: [[DEST:%.*]] = alloca [16 x i8], align 1
+; MSSA-NEXT: [[SRC:%.*]] = alloca [16 x i8], align 1
+; MSSA-NEXT: [[DEST_I8:%.*]] = bitcast [16 x i8]* [[DEST]] to i8*
+; MSSA-NEXT: [[SRC_I8:%.*]] = bitcast [16 x i8]* [[SRC]] to i8*
+; MSSA-NEXT: call void @llvm.memset.p0i8.i64(i8* [[SRC_I8]], i8 0, i64 16, i1 false)
+; MSSA-NEXT: [[X:%.*]] = load i8, i8* [[SRC_I8]], align 1
+; MSSA-NEXT: call void @llvm.memset.p0i8.i64(i8* [[DEST_I8]], i8 0, i64 16, i1 false)
+; MSSA-NEXT: ret i8 [[X]]
 ;
   %dest = alloca [16 x i8]
   %src = alloca [16 x i8]
diff --git a/llvm/test/Transforms/MemCpyOpt/invariant.start.ll b/llvm/test/Transforms/MemCpyOpt/invariant.start.ll
--- a/llvm/test/Transforms/MemCpyOpt/invariant.start.ll
+++ b/llvm/test/Transforms/MemCpyOpt/invariant.start.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; MemCpy optimizations should take place even in presence of invariant.start
-; RUN: opt < %s -basic-aa -memcpyopt -S -enable-memcpyopt-memoryssa=0 | FileCheck %s
-; RUN: opt < %s -basic-aa -memcpyopt -S -enable-memcpyopt-memoryssa=1 -verify-memoryssa | FileCheck %s
+; RUN: opt < %s -basic-aa -memcpyopt -S -enable-memcpyopt-memoryssa=0 | FileCheck %s --check-prefixes=CHECK,NO_MSSA
+; RUN: opt < %s -basic-aa -memcpyopt -S -enable-memcpyopt-memoryssa=1 -verify-memoryssa | FileCheck %s --check-prefixes=CHECK,MSSA

 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"

@@ -18,13 +18,21 @@
 ; The intermediate alloca and one of the memcpy's should be eliminated, the
 ; other should be transformed to a memmove.
 define void @test1(i8* %P, i8* %Q) nounwind {
-; CHECK-LABEL: @test1(
-; CHECK-NEXT: [[MEMTMP:%.*]] = alloca [[TMP0:%.*]], align 16
-; CHECK-NEXT: [[R:%.*]] = bitcast %0* [[MEMTMP]] to i8*
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[R]], i8* align 16 [[P:%.*]], i32 32, i1 false)
-; CHECK-NEXT: [[I:%.*]] = call {}* @llvm.invariant.start.p0i8(i64 32, i8* [[P]])
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[Q:%.*]], i8* align 16 [[R]], i32 32, i1 false)
-; CHECK-NEXT: ret void
+; NO_MSSA-LABEL: @test1(
+; NO_MSSA-NEXT: [[MEMTMP:%.*]] = alloca [[TMP0:%.*]], align 16
+; NO_MSSA-NEXT: [[R:%.*]] = bitcast %0* [[MEMTMP]] to i8*
+; NO_MSSA-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[R]], i8* align 16 [[P:%.*]], i32 32, i1 false)
+; NO_MSSA-NEXT: [[I:%.*]] = call {}* @llvm.invariant.start.p0i8(i64 32, i8* [[P]])
+; NO_MSSA-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[Q:%.*]], i8* align 16 [[R]], i32 32, i1 false)
+; NO_MSSA-NEXT: ret void
+;
+; MSSA-LABEL: @test1(
+; MSSA-NEXT: [[MEMTMP:%.*]] = alloca [[TMP0:%.*]], align 16
+; MSSA-NEXT: [[R:%.*]] = bitcast %0* [[MEMTMP]] to i8*
+; MSSA-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[R]], i8* align 16 [[P:%.*]], i32 32, i1 false)
+; MSSA-NEXT: [[I:%.*]] = call {}* @llvm.invariant.start.p0i8(i64 32, i8* [[P]])
+; MSSA-NEXT: call void @llvm.memmove.p0i8.p0i8.i32(i8* align 16 [[Q:%.*]], i8* align 16 [[P]], i32 32, i1 false)
+; MSSA-NEXT: ret void
 ;
   %memtmp = alloca %0, align 16
   %R = bitcast %0* %memtmp to i8*
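The next two files cover the main new capability: writtenBetween() queries the MemorySSA walker rather than scanning a single block, so memcpy-memcpy forwarding now works across control flow. A hand-written sketch of the shape being tested (all names and sizes invented):

declare void @llvm.memcpy.p0i8.p0i8.i64(i8*, i8*, i64, i1)

define void @nonlocal_sketch(i8* noalias %dst, i8* %src, i1 %c) {
entry:
  %buf = alloca [64 x i8]
  %tmp = getelementptr inbounds [64 x i8], [64 x i8]* %buf, i32 0, i32 0
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %tmp, i8* %src, i64 64, i1 false)
  br i1 %c, label %use, label %exit

use:
  ; No write clobbers %src on the path from the first copy, so this copy can
  ; read from %src directly, which in turn makes %buf dead.
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %tmp, i64 64, i1 false)
  br label %exit

exit:
  ret void
}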
diff --git a/llvm/test/Transforms/MemCpyOpt/memcpy-invoke-memcpy.ll b/llvm/test/Transforms/MemCpyOpt/memcpy-invoke-memcpy.ll
--- a/llvm/test/Transforms/MemCpyOpt/memcpy-invoke-memcpy.ll
+++ b/llvm/test/Transforms/MemCpyOpt/memcpy-invoke-memcpy.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -memcpyopt -S -enable-memcpyopt-memoryssa=0 | FileCheck %s
-; RUN: opt < %s -memcpyopt -S -enable-memcpyopt-memoryssa=1 -verify-memoryssa | FileCheck %s
+; RUN: opt < %s -memcpyopt -S -enable-memcpyopt-memoryssa=0 | FileCheck %s --check-prefixes=CHECK,NO_MSSA
+; RUN: opt < %s -memcpyopt -S -enable-memcpyopt-memoryssa=1 -verify-memoryssa | FileCheck %s --check-prefixes=CHECK,MSSA

 ; Test memcpy-memcpy dependencies across invoke edges.

@@ -8,19 +8,33 @@
 ; TODO: Not supported yet.
 define hidden void @test_normal(i8* noalias %dst, i8* %src) personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
-; CHECK-LABEL: @test_normal(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TEMP:%.*]] = alloca i8, i32 64, align 1
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TEMP]], i8* nonnull align 8 [[SRC:%.*]], i64 64, i1 false)
-; CHECK-NEXT: invoke void @invoke_me()
-; CHECK-NEXT: to label [[TRY_CONT:%.*]] unwind label [[LPAD:%.*]]
-; CHECK: lpad:
-; CHECK-NEXT: [[TMP0:%.*]] = landingpad { i8*, i32 }
-; CHECK-NEXT: catch i8* null
-; CHECK-NEXT: ret void
-; CHECK: try.cont:
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[DST:%.*]], i8* align 8 [[TEMP]], i64 64, i1 false)
-; CHECK-NEXT: ret void
+; NO_MSSA-LABEL: @test_normal(
+; NO_MSSA-NEXT: entry:
+; NO_MSSA-NEXT: [[TEMP:%.*]] = alloca i8, i32 64, align 1
+; NO_MSSA-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TEMP]], i8* nonnull align 8 [[SRC:%.*]], i64 64, i1 false)
+; NO_MSSA-NEXT: invoke void @invoke_me()
+; NO_MSSA-NEXT: to label [[TRY_CONT:%.*]] unwind label [[LPAD:%.*]]
+; NO_MSSA: lpad:
+; NO_MSSA-NEXT: [[TMP0:%.*]] = landingpad { i8*, i32 }
+; NO_MSSA-NEXT: catch i8* null
+; NO_MSSA-NEXT: ret void
+; NO_MSSA: try.cont:
+; NO_MSSA-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[DST:%.*]], i8* align 8 [[TEMP]], i64 64, i1 false)
+; NO_MSSA-NEXT: ret void
+;
+; MSSA-LABEL: @test_normal(
+; MSSA-NEXT: entry:
+; MSSA-NEXT: [[TEMP:%.*]] = alloca i8, i32 64, align 1
+; MSSA-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TEMP]], i8* nonnull align 8 [[SRC:%.*]], i64 64, i1 false)
+; MSSA-NEXT: invoke void @invoke_me()
+; MSSA-NEXT: to label [[TRY_CONT:%.*]] unwind label [[LPAD:%.*]]
+; MSSA: lpad:
+; MSSA-NEXT: [[TMP0:%.*]] = landingpad { i8*, i32 }
+; MSSA-NEXT: catch i8* null
+; MSSA-NEXT: ret void
+; MSSA: try.cont:
+; MSSA-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[DST:%.*]], i8* align 8 [[SRC]], i64 64, i1 false)
+; MSSA-NEXT: ret void
 ;
 entry:
   %temp = alloca i8, i32 64
@@ -42,19 +56,33 @@
 ; TODO: Not supported yet.
 define hidden void @test_unwind(i8* noalias %dst, i8* %src) personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
-; CHECK-LABEL: @test_unwind(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TEMP:%.*]] = alloca i8, i32 64, align 1
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TEMP]], i8* nonnull align 8 [[SRC:%.*]], i64 64, i1 false)
-; CHECK-NEXT: invoke void @invoke_me()
-; CHECK-NEXT: to label [[TRY_CONT:%.*]] unwind label [[LPAD:%.*]]
-; CHECK: lpad:
-; CHECK-NEXT: [[TMP0:%.*]] = landingpad { i8*, i32 }
-; CHECK-NEXT: catch i8* null
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[DST:%.*]], i8* align 8 [[TEMP]], i64 64, i1 false)
-; CHECK-NEXT: ret void
-; CHECK: try.cont:
-; CHECK-NEXT: ret void
+; NO_MSSA-LABEL: @test_unwind(
+; NO_MSSA-NEXT: entry:
+; NO_MSSA-NEXT: [[TEMP:%.*]] = alloca i8, i32 64, align 1
+; NO_MSSA-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TEMP]], i8* nonnull align 8 [[SRC:%.*]], i64 64, i1 false)
+; NO_MSSA-NEXT: invoke void @invoke_me()
+; NO_MSSA-NEXT: to label [[TRY_CONT:%.*]] unwind label [[LPAD:%.*]]
+; NO_MSSA: lpad:
+; NO_MSSA-NEXT: [[TMP0:%.*]] = landingpad { i8*, i32 }
+; NO_MSSA-NEXT: catch i8* null
+; NO_MSSA-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[DST:%.*]], i8* align 8 [[TEMP]], i64 64, i1 false)
+; NO_MSSA-NEXT: ret void
+; NO_MSSA: try.cont:
+; NO_MSSA-NEXT: ret void
+;
+; MSSA-LABEL: @test_unwind(
+; MSSA-NEXT: entry:
+; MSSA-NEXT: [[TEMP:%.*]] = alloca i8, i32 64, align 1
+; MSSA-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TEMP]], i8* nonnull align 8 [[SRC:%.*]], i64 64, i1 false)
+; MSSA-NEXT: invoke void @invoke_me()
+; MSSA-NEXT: to label [[TRY_CONT:%.*]] unwind label [[LPAD:%.*]]
+; MSSA: lpad:
+; MSSA-NEXT: [[TMP0:%.*]] = landingpad { i8*, i32 }
+; MSSA-NEXT: catch i8* null
+; MSSA-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[DST:%.*]], i8* align 8 [[SRC]], i64 64, i1 false)
+; MSSA-NEXT: ret void
+; MSSA: try.cont:
+; MSSA-NEXT: ret void
 ;
 entry:
   %temp = alloca i8, i32 64
diff --git a/llvm/test/Transforms/MemCpyOpt/memcpy.ll b/llvm/test/Transforms/MemCpyOpt/memcpy.ll
--- a/llvm/test/Transforms/MemCpyOpt/memcpy.ll
+++ b/llvm/test/Transforms/MemCpyOpt/memcpy.ll
@@ -141,13 +141,21 @@
 }

 define i8 @test4_read_between(i8 *%P) {
-; CHECK-LABEL: @test4_read_between(
-; CHECK-NEXT: [[A1:%.*]] = alloca [[TMP1:%.*]], align 8
-; CHECK-NEXT: [[A2:%.*]] = bitcast %1* [[A1]] to i8*
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[A2]], i8* align 4 [[P:%.*]], i64 8, i1 false)
-; CHECK-NEXT: [[X:%.*]] = load i8, i8* [[A2]], align 1
-; CHECK-NEXT: call void @test4a(i8* byval(i8) align 1 [[A2]])
-; CHECK-NEXT: ret i8 [[X]]
+; NO_MSSA-LABEL: @test4_read_between(
+; NO_MSSA-NEXT: [[A1:%.*]] = alloca [[TMP1:%.*]], align 8
+; NO_MSSA-NEXT: [[A2:%.*]] = bitcast %1* [[A1]] to i8*
+; NO_MSSA-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[A2]], i8* align 4 [[P:%.*]], i64 8, i1 false)
+; NO_MSSA-NEXT: [[X:%.*]] = load i8, i8* [[A2]], align 1
+; NO_MSSA-NEXT: call void @test4a(i8* byval align 1 [[A2]])
+; NO_MSSA-NEXT: ret i8 [[X]]
+;
+; MSSA-LABEL: @test4_read_between(
+; MSSA-NEXT: [[A1:%.*]] = alloca [[TMP1:%.*]], align 8
+; MSSA-NEXT: [[A2:%.*]] = bitcast %1* [[A1]] to i8*
+; MSSA-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[A2]], i8* align 4 [[P:%.*]], i64 8, i1 false)
+; MSSA-NEXT: [[X:%.*]] = load i8, i8* [[A2]], align 1
+; MSSA-NEXT: call void @test4a(i8* byval align 1 [[P]])
+; MSSA-NEXT: ret i8 [[X]]
 ;
   %a1 = alloca %1
   %a2 = bitcast %1* %a1 to i8*
@@ -158,16 +166,27 @@
 }

 define void @test4_non_local(i8 *%P, i1 %c) {
-; CHECK-LABEL: @test4_non_local(
-; CHECK-NEXT: [[A1:%.*]] = alloca [[TMP1:%.*]], align 8
-; CHECK-NEXT: [[A2:%.*]] = bitcast %1* [[A1]] to i8*
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[A2]], i8* align 4 [[P:%.*]], i64 8, i1 false)
-; CHECK-NEXT: br i1 [[C:%.*]], label [[CALL:%.*]], label [[EXIT:%.*]]
-; CHECK: call:
-; CHECK-NEXT: call void @test4a(i8* byval(i8) align 1 [[A2]])
-; CHECK-NEXT: br label [[EXIT]]
-; CHECK: exit:
-; CHECK-NEXT: ret void
+; NO_MSSA-LABEL: @test4_non_local(
+; NO_MSSA-NEXT: [[A1:%.*]] = alloca [[TMP1:%.*]], align 8
+; NO_MSSA-NEXT: [[A2:%.*]] = bitcast %1* [[A1]] to i8*
+; NO_MSSA-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[A2]], i8* align 4 [[P:%.*]], i64 8, i1 false)
+; NO_MSSA-NEXT: br i1 [[C:%.*]], label [[CALL:%.*]], label [[EXIT:%.*]]
+; NO_MSSA: call:
+; NO_MSSA-NEXT: call void @test4a(i8* byval align 1 [[A2]])
+; NO_MSSA-NEXT: br label [[EXIT]]
+; NO_MSSA: exit:
+; NO_MSSA-NEXT: ret void
+;
+; MSSA-LABEL: @test4_non_local(
+; MSSA-NEXT: [[A1:%.*]] = alloca [[TMP1:%.*]], align 8
+; MSSA-NEXT: [[A2:%.*]] = bitcast %1* [[A1]] to i8*
+; MSSA-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[A2]], i8* align 4 [[P:%.*]], i64 8, i1 false)
+; MSSA-NEXT: br i1 [[C:%.*]], label [[CALL:%.*]], label [[EXIT:%.*]]
+; MSSA: call:
+; MSSA-NEXT: call void @test4a(i8* byval align 1 [[P]])
+; MSSA-NEXT: br label [[EXIT]]
+; MSSA: exit:
+; MSSA-NEXT: ret void
 ;
   %a1 = alloca %1
   %a2 = bitcast %1* %a1 to i8*
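merge-into-memset.ll below exercises the memcpy-from-memset transform (performMemCpyToMemSetOptzn()): a copy out of a buffer that a dominating memset filled can itself become a memset, shrunk to the number of bytes the memset actually defined. Schematically (a sketch with invented names and sizes):

declare void @llvm.memset.p0i8.i64(i8*, i8, i64, i1)
declare void @llvm.memcpy.p0i8.p0i8.i64(i8*, i8*, i64, i1)

define void @memset_forward_sketch(i8* %dst) {
  %buf = alloca [16 x i8]
  %p = getelementptr inbounds [16 x i8], [16 x i8]* %buf, i32 0, i32 0
  call void @llvm.memset.p0i8.i64(i8* %p, i8 0, i64 16, i1 false)
  ; Every copied byte is known to be 0, so this copy can be rewritten as
  ; memset(%dst, 0, 16).
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %p, i64 16, i1 false)
  ret void
}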
diff --git a/llvm/test/Transforms/MemCpyOpt/merge-into-memset.ll b/llvm/test/Transforms/MemCpyOpt/merge-into-memset.ll
--- a/llvm/test/Transforms/MemCpyOpt/merge-into-memset.ll
+++ b/llvm/test/Transforms/MemCpyOpt/merge-into-memset.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -memcpyopt -S -enable-memcpyopt-memoryssa=0 | FileCheck %s
-; RUN: opt < %s -memcpyopt -S -enable-memcpyopt-memoryssa=1 -verify-memoryssa | FileCheck %s
+; RUN: opt < %s -memcpyopt -S -enable-memcpyopt-memoryssa=0 | FileCheck %s --check-prefix=NO_MSSA
+; RUN: opt < %s -memcpyopt -S -enable-memcpyopt-memoryssa=1 -verify-memoryssa | FileCheck %s --check-prefix=MSSA

 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"

@@ -8,19 +8,33 @@
 ; which will be deleted.
 define void @foo(i1 %c, i8* %d, i8* %e, i8* %f) {
-; CHECK-LABEL: @foo(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP:%.*]] = alloca [50 x i8], align 8
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast [50 x i8]* [[TMP]] to i8*
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i64 1
-; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* nonnull [[D:%.*]], i8 0, i64 10, i1 false)
-; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 8 [[TMP4]], i8 0, i64 11, i1 false)
-; CHECK-NEXT: br i1 [[C:%.*]], label [[IF_THEN:%.*]], label [[EXIT:%.*]]
-; CHECK: if.then:
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[F:%.*]], i8* nonnull align 8 [[TMP4]], i64 30, i1 false)
-; CHECK-NEXT: br label [[EXIT]]
-; CHECK: exit:
-; CHECK-NEXT: ret void
+; NO_MSSA-LABEL: @foo(
+; NO_MSSA-NEXT: entry:
+; NO_MSSA-NEXT: [[TMP:%.*]] = alloca [50 x i8], align 8
+; NO_MSSA-NEXT: [[TMP4:%.*]] = bitcast [50 x i8]* [[TMP]] to i8*
+; NO_MSSA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i64 1
+; NO_MSSA-NEXT: call void @llvm.memset.p0i8.i64(i8* nonnull [[D:%.*]], i8 0, i64 10, i1 false)
+; NO_MSSA-NEXT: call void @llvm.memset.p0i8.i64(i8* align 8 [[TMP4]], i8 0, i64 11, i1 false)
+; NO_MSSA-NEXT: br i1 [[C:%.*]], label [[IF_THEN:%.*]], label [[EXIT:%.*]]
+; NO_MSSA: if.then:
+; NO_MSSA-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[F:%.*]], i8* nonnull align 8 [[TMP4]], i64 30, i1 false)
+; NO_MSSA-NEXT: br label [[EXIT]]
+; NO_MSSA: exit:
+; NO_MSSA-NEXT: ret void
+;
+; MSSA-LABEL: @foo(
+; MSSA-NEXT: entry:
+; MSSA-NEXT: [[TMP:%.*]] = alloca [50 x i8], align 8
+; MSSA-NEXT: [[TMP4:%.*]] = bitcast [50 x i8]* [[TMP]] to i8*
+; MSSA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i64 1
+; MSSA-NEXT: call void @llvm.memset.p0i8.i64(i8* nonnull [[D:%.*]], i8 0, i64 10, i1 false)
+; MSSA-NEXT: call void @llvm.memset.p0i8.i64(i8* align 8 [[TMP4]], i8 0, i64 11, i1 false)
+; MSSA-NEXT: br i1 [[C:%.*]], label [[IF_THEN:%.*]], label [[EXIT:%.*]]
+; MSSA: if.then:
+; MSSA-NEXT: call void @llvm.memset.p0i8.i64(i8* align 8 [[F:%.*]], i8 0, i64 11, i1 false)
+; MSSA-NEXT: br label [[EXIT]]
+; MSSA: exit:
+; MSSA-NEXT: ret void
 ;
 entry:
   %tmp = alloca [50 x i8], align 8
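mixed-sizes.ll then checks the size guard on memcpy-memcpy forwarding: the second copy may only be redirected to the original source if it does not read past what the first copy wrote. A sketch of the invalid case (names and sizes invented):

declare void @llvm.memcpy.p0i8.p0i8.i64(i8*, i8*, i64, i1)

define void @size_guard_sketch(i8* noalias %dst, i8* %src) {
  %buf = alloca [8 x i8]
  %p = getelementptr inbounds [8 x i8], [8 x i8]* %buf, i32 0, i32 0
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %p, i8* %src, i64 4, i1 false)
  ; This reads bytes 4..7 of %buf, which the first copy never wrote, so it
  ; must not be rewritten into a copy from %src.
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %p, i64 8, i1 false)
  ret void
}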
diff --git a/llvm/test/Transforms/MemCpyOpt/mixed-sizes.ll b/llvm/test/Transforms/MemCpyOpt/mixed-sizes.ll
--- a/llvm/test/Transforms/MemCpyOpt/mixed-sizes.ll
+++ b/llvm/test/Transforms/MemCpyOpt/mixed-sizes.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -memcpyopt -S -enable-memcpyopt-memoryssa=0 | FileCheck %s
-; RUN: opt < %s -memcpyopt -S -enable-memcpyopt-memoryssa=1 -verify-memoryssa | FileCheck %s
+; RUN: opt < %s -memcpyopt -S -enable-memcpyopt-memoryssa=0 | FileCheck %s --check-prefix=NO_MSSA
+; RUN: opt < %s -memcpyopt -S -enable-memcpyopt-memoryssa=1 -verify-memoryssa | FileCheck %s --check-prefix=MSSA

 ; Handle memcpy-memcpy dependencies of differing sizes correctly.

 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"

@@ -9,25 +9,44 @@
 ; memcpy with a larger size from the same address.
 define i32 @foo(i1 %z) {
-; CHECK-LABEL: @foo(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[A:%.*]] = alloca [10 x i32], align 4
-; CHECK-NEXT: [[S:%.*]] = alloca [10 x i32], align 4
-; CHECK-NEXT: [[TMP0:%.*]] = bitcast [10 x i32]* [[A]] to i8*
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast [10 x i32]* [[S]] to i8*
-; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* nonnull align 16 [[TMP1]], i8 0, i64 40, i1 false)
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[A]], i64 0, i64 0
-; CHECK-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr [10 x i32], [10 x i32]* [[S]], i64 0, i64 1
-; CHECK-NEXT: [[SCEVGEP7:%.*]] = bitcast i32* [[SCEVGEP]] to i8*
-; CHECK-NEXT: br i1 [[Z:%.*]], label [[FOR_BODY3_LR_PH:%.*]], label [[FOR_INC7_1:%.*]]
-; CHECK: for.body3.lr.ph:
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP0]], i8* align 4 [[SCEVGEP7]], i64 17179869180, i1 false)
-; CHECK-NEXT: br label [[FOR_INC7_1]]
-; CHECK: for.inc7.1:
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP0]], i8* align 4 [[SCEVGEP7]], i64 4, i1 false)
-; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT: ret i32 [[TMP2]]
+; NO_MSSA-LABEL: @foo(
+; NO_MSSA-NEXT: entry:
+; NO_MSSA-NEXT: [[A:%.*]] = alloca [10 x i32], align 4
+; NO_MSSA-NEXT: [[S:%.*]] = alloca [10 x i32], align 4
+; NO_MSSA-NEXT: [[TMP0:%.*]] = bitcast [10 x i32]* [[A]] to i8*
+; NO_MSSA-NEXT: [[TMP1:%.*]] = bitcast [10 x i32]* [[S]] to i8*
+; NO_MSSA-NEXT: call void @llvm.memset.p0i8.i64(i8* nonnull align 16 [[TMP1]], i8 0, i64 40, i1 false)
+; NO_MSSA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[A]], i64 0, i64 0
+; NO_MSSA-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4
+; NO_MSSA-NEXT: [[SCEVGEP:%.*]] = getelementptr [10 x i32], [10 x i32]* [[S]], i64 0, i64 1
+; NO_MSSA-NEXT: [[SCEVGEP7:%.*]] = bitcast i32* [[SCEVGEP]] to i8*
+; NO_MSSA-NEXT: br i1 [[Z:%.*]], label [[FOR_BODY3_LR_PH:%.*]], label [[FOR_INC7_1:%.*]]
+; NO_MSSA: for.body3.lr.ph:
+; NO_MSSA-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP0]], i8* align 4 [[SCEVGEP7]], i64 17179869180, i1 false)
+; NO_MSSA-NEXT: br label [[FOR_INC7_1]]
+; NO_MSSA: for.inc7.1:
+; NO_MSSA-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP0]], i8* align 4 [[SCEVGEP7]], i64 4, i1 false)
+; NO_MSSA-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; NO_MSSA-NEXT: ret i32 [[TMP2]]
+;
+; MSSA-LABEL: @foo(
+; MSSA-NEXT: entry:
+; MSSA-NEXT: [[A:%.*]] = alloca [10 x i32], align 4
+; MSSA-NEXT: [[S:%.*]] = alloca [10 x i32], align 4
+; MSSA-NEXT: [[TMP0:%.*]] = bitcast [10 x i32]* [[A]] to i8*
+; MSSA-NEXT: [[TMP1:%.*]] = bitcast [10 x i32]* [[S]] to i8*
+; MSSA-NEXT: call void @llvm.memset.p0i8.i64(i8* nonnull align 16 [[TMP1]], i8 0, i64 40, i1 false)
+; MSSA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[A]], i64 0, i64 0
+; MSSA-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4
+; MSSA-NEXT: [[SCEVGEP:%.*]] = getelementptr [10 x i32], [10 x i32]* [[S]], i64 0, i64 1
+; MSSA-NEXT: [[SCEVGEP7:%.*]] = bitcast i32* [[SCEVGEP]] to i8*
+; MSSA-NEXT: br i1 [[Z:%.*]], label [[FOR_BODY3_LR_PH:%.*]], label [[FOR_INC7_1:%.*]]
+; MSSA: for.body3.lr.ph:
+; MSSA-NEXT: br label [[FOR_INC7_1]]
+; MSSA: for.inc7.1:
+; MSSA-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP0]], i8* align 4 [[SCEVGEP7]], i64 4, i1 false)
+; MSSA-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; MSSA-NEXT: ret i32 [[TMP2]]
 ;
 entry:
   %a = alloca [10 x i32]
diff --git a/llvm/test/Transforms/MemCpyOpt/nonlocal-memcpy-memcpy.ll b/llvm/test/Transforms/MemCpyOpt/nonlocal-memcpy-memcpy.ll
--- a/llvm/test/Transforms/MemCpyOpt/nonlocal-memcpy-memcpy.ll
+++ b/llvm/test/Transforms/MemCpyOpt/nonlocal-memcpy-memcpy.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -memcpyopt -S -enable-memcpyopt-memoryssa=0 | FileCheck %s
-; RUN: opt < %s -memcpyopt -S -enable-memcpyopt-memoryssa=1 -verify-memoryssa | FileCheck %s
+; RUN: opt < %s -memcpyopt -S -enable-memcpyopt-memoryssa=0 | FileCheck %s --check-prefix=NO_MSSA
+; RUN: opt < %s -memcpyopt -S -enable-memcpyopt-memoryssa=1 -verify-memoryssa | FileCheck %s --check-prefix=MSSA

 ; Test whether memcpy-memcpy dependence is optimized across
 ; basic blocks (conditional branches and invokes).

@@ -22,17 +22,29 @@
 ; to copy directly from the original source rather than from the temporary.
 define void @wobble(i8* noalias %dst, i8* %src, i1 %some_condition) {
-; CHECK-LABEL: @wobble(
-; CHECK-NEXT: bb:
-; CHECK-NEXT: [[TEMP:%.*]] = alloca i8, i32 64, align 1
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TEMP]], i8* nonnull align 8 [[SRC:%.*]], i64 64, i1 false)
-; CHECK-NEXT: br i1 [[SOME_CONDITION:%.*]], label [[MORE:%.*]], label [[OUT:%.*]]
-; CHECK: out:
-; CHECK-NEXT: call void @qux()
-; CHECK-NEXT: unreachable
-; CHECK: more:
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[DST:%.*]], i8* align 8 [[TEMP]], i64 64, i1 false)
-; CHECK-NEXT: ret void
+; NO_MSSA-LABEL: @wobble(
+; NO_MSSA-NEXT: bb:
+; NO_MSSA-NEXT: [[TEMP:%.*]] = alloca i8, i32 64, align 1
+; NO_MSSA-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TEMP]], i8* nonnull align 8 [[SRC:%.*]], i64 64, i1 false)
+; NO_MSSA-NEXT: br i1 [[SOME_CONDITION:%.*]], label [[MORE:%.*]], label [[OUT:%.*]]
+; NO_MSSA: out:
+; NO_MSSA-NEXT: call void @qux()
+; NO_MSSA-NEXT: unreachable
+; NO_MSSA: more:
+; NO_MSSA-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[DST:%.*]], i8* align 8 [[TEMP]], i64 64, i1 false)
+; NO_MSSA-NEXT: ret void
+;
+; MSSA-LABEL: @wobble(
+; MSSA-NEXT: bb:
+; MSSA-NEXT: [[TEMP:%.*]] = alloca i8, i32 64, align 1
+; MSSA-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TEMP]], i8* nonnull align 8 [[SRC:%.*]], i64 64, i1 false)
+; MSSA-NEXT: br i1 [[SOME_CONDITION:%.*]], label [[MORE:%.*]], label [[OUT:%.*]]
+; MSSA: out:
+; MSSA-NEXT: call void @qux()
+; MSSA-NEXT: unreachable
+; MSSA: more:
+; MSSA-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[DST:%.*]], i8* align 8 [[SRC]], i64 64, i1 false)
+; MSSA-NEXT: ret void
 ;
 bb:
   %temp = alloca i8, i32 64
@@ -53,25 +65,45 @@
 ; source rather than from the temporary.
 define i32 @foo(i1 %t3) {
-; CHECK-LABEL: @foo(
-; CHECK-NEXT: bb:
-; CHECK-NEXT: [[S:%.*]] = alloca [[STRUCT_S:%.*]], align 4
-; CHECK-NEXT: [[T:%.*]] = alloca [[STRUCT_S]], align 4
-; CHECK-NEXT: [[S1:%.*]] = bitcast %struct.s* [[S]] to i8*
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[S1]], i8* align 4 bitcast (%struct.s* @s_foo to i8*), i64 8, i1 false)
-; CHECK-NEXT: br i1 [[T3:%.*]], label [[BB4:%.*]], label [[BB7:%.*]]
-; CHECK: bb4:
-; CHECK-NEXT: [[T5:%.*]] = bitcast %struct.s* [[T]] to i8*
-; CHECK-NEXT: [[S6:%.*]] = bitcast %struct.s* [[S]] to i8*
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[T5]], i8* align 4 [[S6]], i64 8, i1 false)
-; CHECK-NEXT: br label [[BB7]]
-; CHECK: bb7:
-; CHECK-NEXT: [[T8:%.*]] = getelementptr [[STRUCT_S]], %struct.s* [[T]], i32 0, i32 0
-; CHECK-NEXT: [[T9:%.*]] = load i32, i32* [[T8]], align 4
-; CHECK-NEXT: [[T10:%.*]] = getelementptr [[STRUCT_S]], %struct.s* [[T]], i32 0, i32 1
-; CHECK-NEXT: [[T11:%.*]] = load i32, i32* [[T10]], align 4
-; CHECK-NEXT: [[T12:%.*]] = add i32 [[T9]], [[T11]]
-; CHECK-NEXT: ret i32 [[T12]]
+; NO_MSSA-LABEL: @foo(
+; NO_MSSA-NEXT: bb:
+; NO_MSSA-NEXT: [[S:%.*]] = alloca [[STRUCT_S:%.*]], align 4
+; NO_MSSA-NEXT: [[T:%.*]] = alloca [[STRUCT_S]], align 4
+; NO_MSSA-NEXT: [[S1:%.*]] = bitcast %struct.s* [[S]] to i8*
+; NO_MSSA-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[S1]], i8* align 4 bitcast (%struct.s* @s_foo to i8*), i64 8, i1 false)
+; NO_MSSA-NEXT: br i1 [[T3:%.*]], label [[BB4:%.*]], label [[BB7:%.*]]
+; NO_MSSA: bb4:
+; NO_MSSA-NEXT: [[T5:%.*]] = bitcast %struct.s* [[T]] to i8*
+; NO_MSSA-NEXT: [[S6:%.*]] = bitcast %struct.s* [[S]] to i8*
+; NO_MSSA-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[T5]], i8* align 4 [[S6]], i64 8, i1 false)
+; NO_MSSA-NEXT: br label [[BB7]]
+; NO_MSSA: bb7:
+; NO_MSSA-NEXT: [[T8:%.*]] = getelementptr [[STRUCT_S]], %struct.s* [[T]], i32 0, i32 0
+; NO_MSSA-NEXT: [[T9:%.*]] = load i32, i32* [[T8]], align 4
+; NO_MSSA-NEXT: [[T10:%.*]] = getelementptr [[STRUCT_S]], %struct.s* [[T]], i32 0, i32 1
+; NO_MSSA-NEXT: [[T11:%.*]] = load i32, i32* [[T10]], align 4
+; NO_MSSA-NEXT: [[T12:%.*]] = add i32 [[T9]], [[T11]]
+; NO_MSSA-NEXT: ret i32 [[T12]]
+;
+; MSSA-LABEL: @foo(
+; MSSA-NEXT: bb:
+; MSSA-NEXT: [[S:%.*]] = alloca [[STRUCT_S:%.*]], align 4
+; MSSA-NEXT: [[T:%.*]] = alloca [[STRUCT_S]], align 4
+; MSSA-NEXT: [[S1:%.*]] = bitcast %struct.s* [[S]] to i8*
+; MSSA-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[S1]], i8* align 4 bitcast (%struct.s* @s_foo to i8*), i64 8, i1 false)
+; MSSA-NEXT: br i1 [[T3:%.*]], label [[BB4:%.*]], label [[BB7:%.*]]
+; MSSA: bb4:
+; MSSA-NEXT: [[T5:%.*]] = bitcast %struct.s* [[T]] to i8*
+; MSSA-NEXT: [[S6:%.*]] = bitcast %struct.s* [[S]] to i8*
+; MSSA-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[T5]], i8* align 4 bitcast (%struct.s* @s_foo to i8*), i64 8, i1 false)
+; MSSA-NEXT: br label [[BB7]]
+; MSSA: bb7:
+; MSSA-NEXT: [[T8:%.*]] = getelementptr [[STRUCT_S]], %struct.s* [[T]], i32 0, i32 0
+; MSSA-NEXT: [[T9:%.*]] = load i32, i32* [[T8]], align 4
+; MSSA-NEXT: [[T10:%.*]] = getelementptr [[STRUCT_S]], %struct.s* [[T]], i32 0, i32 1
+; MSSA-NEXT: [[T11:%.*]] = load i32, i32* [[T10]], align 4
+; MSSA-NEXT: [[T12:%.*]] = add i32 [[T9]], [[T11]]
+; MSSA-NEXT: ret i32 [[T12]]
 ;
 bb:
   %s = alloca %struct.s, align 4
@@ -102,37 +134,69 @@
 ; pattern.
 define i32 @baz(i1 %t5) personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
-; CHECK-LABEL: @baz(
-; CHECK-NEXT: bb:
-; CHECK-NEXT: [[S:%.*]] = alloca [[STRUCT_S:%.*]], align 4
-; CHECK-NEXT: [[T:%.*]] = alloca [[STRUCT_S]], align 4
-; CHECK-NEXT: [[S3:%.*]] = bitcast %struct.s* [[S]] to i8*
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[S3]], i8* align 4 bitcast (%struct.s* @s_baz to i8*), i64 8, i1 false)
-; CHECK-NEXT: br i1 [[T5:%.*]], label [[BB6:%.*]], label [[BB22:%.*]]
-; CHECK: bb6:
-; CHECK-NEXT: invoke void @__cxa_throw(i8* null, i8* bitcast (i8** @i to i8*), i8* null)
-; CHECK-NEXT: to label [[BB25:%.*]] unwind label [[BB9:%.*]]
-; CHECK: bb9:
-; CHECK-NEXT: [[T10:%.*]] = landingpad { i8*, i32 }
-; CHECK-NEXT: catch i8* null
-; CHECK-NEXT: br label [[BB13:%.*]]
-; CHECK: bb13:
-; CHECK-NEXT: [[T15:%.*]] = call i8* @__cxa_begin_catch(i8* null)
-; CHECK-NEXT: br label [[BB23:%.*]]
-; CHECK: bb22:
-; CHECK-NEXT: [[T23:%.*]] = bitcast %struct.s* [[T]] to i8*
-; CHECK-NEXT: [[S24:%.*]] = bitcast %struct.s* [[S]] to i8*
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[T23]], i8* align 4 [[S24]], i64 8, i1 false)
-; CHECK-NEXT: br label [[BB23]]
-; CHECK: bb23:
-; CHECK-NEXT: [[T17:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.s* [[T]], i32 0, i32 0
-; CHECK-NEXT: [[T18:%.*]] = load i32, i32* [[T17]], align 4
-; CHECK-NEXT: [[T19:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.s* [[T]], i32 0, i32 1
-; CHECK-NEXT: [[T20:%.*]] = load i32, i32* [[T19]], align 4
-; CHECK-NEXT: [[T21:%.*]] = add nsw i32 [[T18]], [[T20]]
-; CHECK-NEXT: ret i32 [[T21]]
-; CHECK: bb25:
-; CHECK-NEXT: unreachable
+; NO_MSSA-LABEL: @baz(
+; NO_MSSA-NEXT: bb:
+; NO_MSSA-NEXT: [[S:%.*]] = alloca [[STRUCT_S:%.*]], align 4
+; NO_MSSA-NEXT: [[T:%.*]] = alloca [[STRUCT_S]], align 4
+; NO_MSSA-NEXT: [[S3:%.*]] = bitcast %struct.s* [[S]] to i8*
+; NO_MSSA-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[S3]], i8* align 4 bitcast (%struct.s* @s_baz to i8*), i64 8, i1 false)
+; NO_MSSA-NEXT: br i1 [[T5:%.*]], label [[BB6:%.*]], label [[BB22:%.*]]
+; NO_MSSA: bb6:
+; NO_MSSA-NEXT: invoke void @__cxa_throw(i8* null, i8* bitcast (i8** @i to i8*), i8* null)
+; NO_MSSA-NEXT: to label [[BB25:%.*]] unwind label [[BB9:%.*]]
+; NO_MSSA: bb9:
+; NO_MSSA-NEXT: [[T10:%.*]] = landingpad { i8*, i32 }
+; NO_MSSA-NEXT: catch i8* null
+; NO_MSSA-NEXT: br label [[BB13:%.*]]
+; NO_MSSA: bb13:
+; NO_MSSA-NEXT: [[T15:%.*]] = call i8* @__cxa_begin_catch(i8* null)
+; NO_MSSA-NEXT: br label [[BB23:%.*]]
+; NO_MSSA: bb22:
+; NO_MSSA-NEXT: [[T23:%.*]] = bitcast %struct.s* [[T]] to i8*
+; NO_MSSA-NEXT: [[S24:%.*]] = bitcast %struct.s* [[S]] to i8*
+; NO_MSSA-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[T23]], i8* align 4 [[S24]], i64 8, i1 false)
+; NO_MSSA-NEXT: br label [[BB23]]
+; NO_MSSA: bb23:
+; NO_MSSA-NEXT: [[T17:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.s* [[T]], i32 0, i32 0
+; NO_MSSA-NEXT: [[T18:%.*]] = load i32, i32* [[T17]], align 4
+; NO_MSSA-NEXT: [[T19:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.s* [[T]], i32 0, i32 1
+; NO_MSSA-NEXT: [[T20:%.*]] = load i32, i32* [[T19]], align 4
+; NO_MSSA-NEXT: [[T21:%.*]] = add nsw i32 [[T18]], [[T20]]
+; NO_MSSA-NEXT: ret i32 [[T21]]
+; NO_MSSA: bb25:
+; NO_MSSA-NEXT: unreachable
+;
+; MSSA-LABEL: @baz(
+; MSSA-NEXT: bb:
+; MSSA-NEXT: [[S:%.*]] = alloca [[STRUCT_S:%.*]], align 4
+; MSSA-NEXT: [[T:%.*]] = alloca [[STRUCT_S]], align 4
+; MSSA-NEXT: [[S3:%.*]] = bitcast %struct.s* [[S]] to i8*
+; MSSA-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[S3]], i8* align 4 bitcast (%struct.s* @s_baz to i8*), i64 8, i1 false)
+; MSSA-NEXT: br i1 [[T5:%.*]], label [[BB6:%.*]], label [[BB22:%.*]]
+; MSSA: bb6:
+; MSSA-NEXT: invoke void @__cxa_throw(i8* null, i8* bitcast (i8** @i to i8*), i8* null)
+; MSSA-NEXT: to label [[BB25:%.*]] unwind label [[BB9:%.*]]
+; MSSA: bb9:
+; MSSA-NEXT: [[T10:%.*]] = landingpad { i8*, i32 }
+; MSSA-NEXT: catch i8* null
+; MSSA-NEXT: br label [[BB13:%.*]]
+; MSSA: bb13:
+; MSSA-NEXT: [[T15:%.*]] = call i8* @__cxa_begin_catch(i8* null)
+; MSSA-NEXT: br label [[BB23:%.*]]
+; MSSA: bb22:
+; MSSA-NEXT: [[T23:%.*]] = bitcast %struct.s* [[T]] to i8*
+; MSSA-NEXT: [[S24:%.*]] = bitcast %struct.s* [[S]] to i8*
+; MSSA-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[T23]], i8* align 4 bitcast (%struct.s* @s_baz to i8*), i64 8, i1 false)
+; MSSA-NEXT: br label [[BB23]]
+; MSSA: bb23:
+; MSSA-NEXT: [[T17:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.s* [[T]], i32 0, i32 0
+; MSSA-NEXT: [[T18:%.*]] = load i32, i32* [[T17]], align 4
+; MSSA-NEXT: [[T19:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.s* [[T]], i32 0, i32 1
+; MSSA-NEXT: [[T20:%.*]] = load i32, i32* [[T19]], align 4
+; MSSA-NEXT: [[T21:%.*]] = add nsw i32 [[T18]], [[T20]]
+; MSSA-NEXT: ret i32 [[T21]]
+; MSSA: bb25:
+; MSSA-NEXT: unreachable
 ;
 bb:
   %s = alloca %struct.s, align 4
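stackrestore.ll below also shows the memmove fallback in processMemCpyMemCpyDependence(): when alias analysis cannot rule out overlap between the original source and the final destination, the forwarded copy is emitted as a memmove rather than a memcpy. In sketch form (names and sizes invented):

declare void @llvm.memcpy.p0i8.p0i8.i64(i8*, i8*, i64, i1)

define void @memmove_fallback_sketch(i8* %dst, i8* %src) {
  %buf = alloca [9 x i8]
  %p = getelementptr inbounds [9 x i8], [9 x i8]* %buf, i32 0, i32 0
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %p, i8* %src, i64 9, i1 false)
  ; %dst and %src are not known to be disjoint, so forwarding produces
  ; a memmove from %src to %dst instead of a memcpy.
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %p, i64 9, i1 false)
  ret void
}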
diff --git a/llvm/test/Transforms/MemCpyOpt/stackrestore.ll b/llvm/test/Transforms/MemCpyOpt/stackrestore.ll
--- a/llvm/test/Transforms/MemCpyOpt/stackrestore.ll
+++ b/llvm/test/Transforms/MemCpyOpt/stackrestore.ll
@@ -16,19 +16,33 @@
 ; a call to @external.
 define i32 @test_norestore(i32 %n) {
-; CHECK-LABEL: @test_norestore(
-; CHECK-NEXT: [[TMPMEM:%.*]] = alloca [10 x i8], align 4
-; CHECK-NEXT: [[TMP:%.*]] = getelementptr inbounds [10 x i8], [10 x i8]* [[TMPMEM]], i32 0, i32 0
-; CHECK-NEXT: [[P:%.*]] = alloca i8, i32 [[N:%.*]], align 4
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[P]], i8* align 1 getelementptr inbounds ([9 x i8], [9 x i8]* @str, i32 0, i32 0), i32 9, i1 false)
-; CHECK-NEXT: [[P10:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 9
-; CHECK-NEXT: store i8 0, i8* [[P10]], align 1
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP]], i8* [[P]], i32 10, i1 false)
-; CHECK-NEXT: call void @external()
-; CHECK-NEXT: [[HEAP:%.*]] = call i8* @malloc(i32 9)
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[HEAP]], i8* [[P]], i32 9, i1 false)
-; CHECK-NEXT: call void @useit(i8* [[HEAP]])
-; CHECK-NEXT: ret i32 0
+; NO_MSSA-LABEL: @test_norestore(
+; NO_MSSA-NEXT: [[TMPMEM:%.*]] = alloca [10 x i8], align 4
+; NO_MSSA-NEXT: [[TMP:%.*]] = getelementptr inbounds [10 x i8], [10 x i8]* [[TMPMEM]], i32 0, i32 0
+; NO_MSSA-NEXT: [[P:%.*]] = alloca i8, i32 [[N:%.*]], align 4
+; NO_MSSA-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[P]], i8* align 1 getelementptr inbounds ([9 x i8], [9 x i8]* @str, i32 0, i32 0), i32 9, i1 false)
+; NO_MSSA-NEXT: [[P10:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 9
+; NO_MSSA-NEXT: store i8 0, i8* [[P10]], align 1
+; NO_MSSA-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP]], i8* [[P]], i32 10, i1 false)
+; NO_MSSA-NEXT: call void @external()
+; NO_MSSA-NEXT: [[HEAP:%.*]] = call i8* @malloc(i32 9)
+; NO_MSSA-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[HEAP]], i8* [[P]], i32 9, i1 false)
+; NO_MSSA-NEXT: call void @useit(i8* [[HEAP]])
+; NO_MSSA-NEXT: ret i32 0
+;
+; MSSA-LABEL: @test_norestore(
+; MSSA-NEXT: [[TMPMEM:%.*]] = alloca [10 x i8], align 4
+; MSSA-NEXT: [[TMP:%.*]] = getelementptr inbounds [10 x i8], [10 x i8]* [[TMPMEM]], i32 0, i32 0
+; MSSA-NEXT: [[P:%.*]] = alloca i8, i32 [[N:%.*]], align 4
+; MSSA-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[P]], i8* align 1 getelementptr inbounds ([9 x i8], [9 x i8]* @str, i32 0, i32 0), i32 9, i1 false)
+; MSSA-NEXT: [[P10:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 9
+; MSSA-NEXT: store i8 0, i8* [[P10]], align 1
+; MSSA-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP]], i8* [[P]], i32 10, i1 false)
+; MSSA-NEXT: call void @external()
+; MSSA-NEXT: [[HEAP:%.*]] = call i8* @malloc(i32 9)
+; MSSA-NEXT: call void @llvm.memmove.p0i8.p0i8.i32(i8* [[HEAP]], i8* align 1 getelementptr inbounds ([9 x i8], [9 x i8]* @str, i32 0, i32 0), i32 9, i1 false)
+; MSSA-NEXT: call void @useit(i8* [[HEAP]])
+; MSSA-NEXT: ret i32 0
 ;
   %tmpmem = alloca [10 x i8], align 4
   %tmp = getelementptr inbounds [10 x i8], [10 x i8]* %tmpmem, i32 0, i32 0