diff --git a/llvm/include/llvm/Analysis/CaptureTracking.h b/llvm/include/llvm/Analysis/CaptureTracking.h
--- a/llvm/include/llvm/Analysis/CaptureTracking.h
+++ b/llvm/include/llvm/Analysis/CaptureTracking.h
@@ -96,6 +96,13 @@
   /// capturing instructions that will not be passed into captured().
   virtual void tooManyUses() = 0;
+  /// visitUse - We found a use of a value derived from the pointer. This is
+  /// called after shouldExplore(). Return true to stop the traversal or
+  /// false to continue looking for more uses.
+  ///
+  /// U->getUser() is always an Instruction.
+  virtual bool visitUse(const Use *U);
+
   /// shouldExplore - This is the use of a value derived from the pointer.
   /// To prune the search (ie., assume that none of its users could possibly
   /// capture) return false. To search it, return true.
diff --git a/llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h b/llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h
--- a/llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h
+++ b/llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h
@@ -20,6 +20,7 @@
 namespace llvm {
 class AAResults;
+class AllocaInst;
 class BatchAAResults;
 class AssumptionCache;
 class CallBase;
@@ -33,6 +34,7 @@
 class MemorySSA;
 class MemorySSAUpdater;
 class MemSetInst;
+class PostDominatorTree;
 class StoreInst;
 class TargetLibraryInfo;
 class Value;
@@ -42,6 +44,7 @@
   AAResults *AA = nullptr;
   AssumptionCache *AC = nullptr;
   DominatorTree *DT = nullptr;
+  PostDominatorTree *PDT = nullptr;
   MemorySSA *MSSA = nullptr;
   MemorySSAUpdater *MSSAU = nullptr;
@@ -52,7 +55,8 @@
   // Glue for the old PM.
   bool runImpl(Function &F, TargetLibraryInfo *TLI, AAResults *AA,
-               AssumptionCache *AC, DominatorTree *DT, MemorySSA *MSSA);
+               AssumptionCache *AC, DominatorTree *DT, PostDominatorTree *PDT,
+               MemorySSA *MSSA);
 private:
   // Helper functions
@@ -74,6 +78,9 @@
   Instruction *tryMergingIntoMemset(Instruction *I, Value *StartPtr,
                                     Value *ByteVal);
   bool moveUp(StoreInst *SI, Instruction *P, const LoadInst *LI);
+  bool performStackMoveOptzn(Instruction *Load, Instruction *Store,
+                             AllocaInst *DestAlloca, AllocaInst *SrcAlloca,
+                             uint64_t Size);
   void eraseInstruction(Instruction *I);
   bool iterateOnFunction(Function &F);
diff --git a/llvm/lib/Analysis/CaptureTracking.cpp b/llvm/lib/Analysis/CaptureTracking.cpp
--- a/llvm/lib/Analysis/CaptureTracking.cpp
+++ b/llvm/lib/Analysis/CaptureTracking.cpp
@@ -55,6 +55,8 @@
 CaptureTracker::~CaptureTracker() = default;
+bool CaptureTracker::visitUse(const Use *U) { return false; }
+
 bool CaptureTracker::shouldExplore(const Use *U) { return true; }
 bool CaptureTracker::isDereferenceableOrNull(Value *O, const DataLayout &DL) {
@@ -444,7 +446,7 @@
   Worklist.reserve(getDefaultMaxUsesToExploreForCaptureTracking());
   SmallSet<const Use *, 20> Visited;
-  auto AddUses = [&](const Value *V) {
+  auto VisitUses = [&](const Value *V) {
     for (const Use &U : V->uses()) {
       // If there are lots of uses, conservatively say that the value
       // is captured to avoid taking too much compile time.
@@ -460,7 +462,7 @@
     }
     return true;
   };
-  if (!AddUses(V))
+  if (!VisitUses(V))
    return;
  auto IsDereferenceableOrNull = [Tracker](Value *V, const DataLayout &DL) {
@@ -468,6 +470,8 @@
  };
  while (!Worklist.empty()) {
    const Use *U = Worklist.pop_back_val();
+    if (Tracker->visitUse(U))
+      return;
    switch (DetermineUseCaptureKind(*U, IsDereferenceableOrNull)) {
    case UseCaptureKind::NO_CAPTURE:
      continue;
@@ -476,7 +480,7 @@
        return;
      continue;
    case UseCaptureKind::PASSTHROUGH:
-      if (!AddUses(U->getUser()))
+      if (!VisitUses(U->getUser()))
        return;
      continue;
    }
diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
--- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -12,7 +12,10 @@
 //===----------------------------------------------------------------------===//
 #include "llvm/Transforms/Scalar/MemCpyOptimizer.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/Bitfields.h"
 #include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/PointerIntPair.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
@@ -25,6 +28,7 @@
 #include "llvm/Analysis/MemoryLocation.h"
 #include "llvm/Analysis/MemorySSA.h"
 #include "llvm/Analysis/MemorySSAUpdater.h"
+#include "llvm/Analysis/PostDominators.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/BasicBlock.h"
@@ -66,12 +70,18 @@
 static cl::opt<bool> EnableMemCpyOptWithoutLibcalls(
     "enable-memcpyopt-without-libcalls", cl::Hidden,
     cl::desc("Enable memcpyopt even when libcalls are disabled"));
+static cl::opt<unsigned>
+    MemCpyOptStackMoveThreshold("memcpyopt-stack-move-threshold", cl::Hidden,
+                                cl::desc("Maximum number of basic blocks the "
+                                         "stack-move optimization may examine"),
+                                cl::init(250));
 STATISTIC(NumMemCpyInstr, "Number of memcpy instructions deleted");
 STATISTIC(NumMemSetInfer, "Number of memsets inferred");
 STATISTIC(NumMoveToCpy, "Number of memmoves converted to memcpy");
 STATISTIC(NumCpyToSet, "Number of memcpys converted to memset");
 STATISTIC(NumCallSlot, "Number of call slot optimizations performed");
+STATISTIC(NumStackMove, "Number of stack-move optimizations performed");
 namespace {
@@ -276,6 +286,8 @@
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addPreserved<DominatorTreeWrapperPass>();
+    AU.addRequired<PostDominatorTreeWrapperPass>();
+    AU.addPreserved<PostDominatorTreeWrapperPass>();
    AU.addPreserved<GlobalsAAWrapperPass>();
    AU.addRequired<TargetLibraryInfoWrapperPass>();
    AU.addRequired<AAResultsWrapperPass>();
@@ -296,6 +308,7 @@
                    false, false)
 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
@@ -799,6 +812,25 @@
      ++NumMemCpyInstr;
      return true;
    }
+
+    // If this is a load-store pair from a stack slot to a stack slot, we
+    // might be able to perform the stack-move optimization just as we do for
+    // memcpys from an alloca to an alloca.
+    if (AllocaInst *DestAlloca =
+            dyn_cast<AllocaInst>(SI->getPointerOperand())) {
+      if (AllocaInst *SrcAlloca =
+              dyn_cast<AllocaInst>(LI->getPointerOperand())) {
+        if (performStackMoveOptzn(LI, SI, DestAlloca, SrcAlloca,
+                                  DL.getTypeStoreSize(T))) {
+          // Avoid invalidating the iterator.
+          BBI = SI->getNextNonDebugInstruction()->getIterator();
+          eraseInstruction(SI);
+          eraseInstruction(LI);
+          ++NumMemCpyInstr;
+          return true;
+        }
+      }
+    }
  }
}
@@ -1433,6 +1465,577 @@
  return true;
}
+// These helper classes are used for the stack-move optimization. See the
+// comments above performStackMoveOptzn() for more details.
+
+namespace {
+
+// Tracks liveness on the basic block level. This is conservative; see the
+// comments above performStackMoveOptzn() for justification.
+class BasicBlockLiveness {
+  // The earliest definition or use we've seen, combined with the three bits
+  // below.
+  PointerIntPair<Instruction *, 3, unsigned> Value;
+
+  // Whether the alloca is live-in to the block (from predecessor basic blocks).
+  using LiveIn = Bitfield::Element<bool, 0, 1>;
+  // Whether the alloca is live-out from the block (to successor basic blocks).
+  using LiveOut = Bitfield::Element<bool, 1, 1>;
+  // Whether there's at least one use of the alloca in this basic block. This
+  // flag is important for detecting liveness conflicts, since the other
+  // information stored here isn't sufficient to determine that a use is present
+  // if a definition precedes it.
+  using HasUse = Bitfield::Element<bool, 2, 1>;
+
+  // Records a new def or use instruction.
+  void setDefUseInst(Instruction *I) {
+    assert((!hasDefUseInst() || I->comesBefore(getDefUseInst())) &&
+           "Tried to overwrite an earlier def or use with a later one!");
+    Value.setPointer(I);
+  }
+
+  // Sets the flag which determines whether this block has a use.
+  void setHasUse(bool On) {
+    unsigned V = Value.getInt();
+    Bitfield::set<HasUse>(V, On);
+    Value.setInt(V);
+  }
+
+public:
+  BasicBlockLiveness() : Value(nullptr) {}
+
+  // Returns the earliest definition or use we've seen in this block.
+  Instruction *getDefUseInst() const { return Value.getPointer(); }
+  // Returns true if there's a definition or use of the memory in this block.
+  bool hasDefUseInst() const { return Value.getPointer() != nullptr; }
+  // Returns true if the memory is live-in to this block (i.e. live-out of a
+  // predecessor).
+  bool isLiveIn() const { return Bitfield::get<LiveIn>(Value.getInt()); }
+  // Returns true if the memory is live-out of this block (i.e. live-in to a
+  // successor).
+  bool isLiveOut() const { return Bitfield::get<LiveOut>(Value.getInt()); }
+  // Returns true if there is at least one use of the memory in this block.
+  bool hasUse() const { return Bitfield::get<HasUse>(Value.getInt()); }
+  // Returns true if this alloca is live anywhere in this block or has
+  // at least one use in it. If this returns false, the alloca is
+  // guaranteed to be completely dead within this basic block.
+  bool isLiveAnywhereOrHasUses() const {
+    return isLiveIn() || isLiveOut() || hasUse();
+  }
+
+  // Records a new definition or use of the alloca being tracked within this
+  // basic block.
+  void update(Instruction *I, bool IsDef) {
+    if (!hasDefUseInst() || I->comesBefore(getDefUseInst())) {
+      setDefUseInst(I);
+      setLiveIn(!IsDef);
+    }
+    if (!IsDef)
+      setHasUse(true);
+  }
+
+  // Adjusts the live-in flag for this block.
+  void setLiveIn(bool On) {
+    unsigned V = Value.getInt();
+    Bitfield::set<LiveIn>(V, On);
+    Value.setInt(V);
+  }
+
+  // Adjusts the live-out flag for this block.
+  void setLiveOut(bool On) {
+    unsigned V = Value.getInt();
+    Bitfield::set<LiveOut>(V, On);
+    Value.setInt(V);
+  }
+};
+
+using BasicBlockLivenessMap = DenseMap<BasicBlock *, BasicBlockLiveness>;
+
+// Tracks uses of an alloca for the purposes of the stack-move optimization.
+//
+// This class does three things: (1) it makes sure that the alloca is never
+// captured; (2) it records defs and uses of the alloca in a map for the
+// liveness analysis to use; (3) it finds the nearest dominator and
+// postdominator of all uses of this alloca for the purpose of lifetime
+// intrinsic "shrink wrapping" if the optimization goes through.
+class StackMoveTracker : public CaptureTracker {
+  // Data layout info.
+  const DataLayout &DL;
+  // Dominator tree info.
+  DominatorTree &DT;
+  // Postdominator tree info.
+  PostDominatorTree &PDT;
+  // The memcpy or store instruction being optimized.
+  Instruction *Store;
+  // The size of the underlying alloca, in bits.
+  TypeSize AllocaSizeInBits;
+
+public:
+  // Keeps track of the lifetime intrinsics that we find. We'll need to remove
+  // these if the optimization goes through.
+  SmallVector<IntrinsicInst *> LifetimeMarkers;
+  // Keeps track of instructions that have !noalias metadata. We need to drop
+  // that metadata if the optimization succeeds.
+  std::vector<Instruction *> NoAliasInstrs;
+  // Liveness information for this alloca, tracked on the basic block level.
+  BasicBlockLivenessMap BBLiveness;
+  // Liveness information for this alloca, tracked on the instruction level for
+  // the single basic block containing the memcpy.
+  DenseMap<Instruction *, bool> StoreBBDefUseMap;
+  // The nearest basic block that dominates all uses of the alloca that we've
+  // seen so far. This is only null if we haven't seen any uses yet.
+  BasicBlock *Dom;
+  // The nearest basic block that postdominates all uses of the alloca that
+  // we've seen so far. This can be null if there's no such postdominator.
+  BasicBlock *PostDom;
+  // The user that caused us to bail out, if any.
+  User *AbortingUser;
+  // Whether we should bail out of the stack-move optimization.
+  bool Abort;
+
+  StackMoveTracker(Instruction *Store, AllocaInst *Alloca, DominatorTree &DT,
+                   PostDominatorTree &PDT)
+      : DL(Store->getModule()->getDataLayout()), DT(DT), PDT(PDT), Store(Store),
+        AllocaSizeInBits(*Alloca->getAllocationSizeInBits(DL)), Dom(nullptr),
+        PostDom(nullptr), AbortingUser(nullptr), Abort(false) {}
+
+private:
+  // Called whenever we see a use or a definition of the alloca. If IsDef is
+  // true, this is a def; otherwise, it's a use.
+  void recordUseOrDef(Instruction *I, bool IsDef) {
+    BasicBlock *BB = I->getParent();
+    BBLiveness[BB].update(I, IsDef);
+
+    // For the basic block containing the store, track liveness on the
+    // instruction level.
+    if (BB == Store->getParent())
+      StoreBBDefUseMap[I] = IsDef;
+
+    // If the instruction has !noalias metadata, record it so that we can delete
+    // the metadata if the optimization succeeds.
+    if (I->hasMetadata(LLVMContext::MD_noalias))
+      NoAliasInstrs.push_back(I);
+  }
+
+public:
+  // If there are too many uses, just bail out to avoid spending excessive
+  // compile time.
+  void tooManyUses() override { Abort = true; }
+
+  // If the pointer was captured, we can't usefully track it, so just bail out.
+  bool captured(const Use *U) override {
+    if (!Abort) {
+      AbortingUser = U->getUser();
+      Abort = true;
+      return true;
+    }
+
+    return false;
+  }
+
+  // Classifies a use as either a true use or a definition, records that, and
+  // updates the nearest common dominator and postdominator accordingly.
+  bool visitUse(const Use *U) override {
+    Instruction *I = cast<Instruction>(U->getUser());
+    BasicBlock *BB = I->getParent();
+
+    // GEPs don't count as uses of the alloca memory (just of the pointer to the
+    // alloca), so we don't care about them here.
+    if (isa<GetElementPtrInst>(I) && U->getOperandNo() == 0)
+      return false;
+
+    // Update the nearest common dominator and postdominator. We know that this
+    // is the first use if Dom is null, because multiple blocks always have a
+    // mutual common dominator (though not necessarily a common postdominator).
+    if (Dom == nullptr) {
+      Dom = PostDom = BB;
+    } else {
+      Dom = DT.findNearestCommonDominator(Dom, BB);
+      if (PostDom != nullptr)
+        PostDom = PDT.findNearestCommonDominator(PostDom, BB);
+    }
+
+    // If an instruction overwrites all bytes of the alloca, it's a definition,
+    // not a use. Detect those cases here.
+    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+      if (II->isLifetimeStartOrEnd()) {
+        // We treat a call to a lifetime intrinsic that covers the entire alloca
+        // as a definition, since both llvm.lifetime.start and llvm.lifetime.end
+        // intrinsics conceptually fill all the bytes of the alloca with an
+        // undefined value. We also note the locations of these intrinsic
+        // calls so that we can delete them later if the optimization succeeds.
+        int64_t Size = cast<ConstantInt>(II->getArgOperand(0))->getSExtValue();
+        if (Size < 0 || uint64_t(Size) * 8 == AllocaSizeInBits) {
+          recordUseOrDef(II, true);
+          LifetimeMarkers.push_back(II);
+          return false;
+        }
+      } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(II)) {
+        if (MI->getArgOperandNo(U) == 0) {
+          if (ConstantInt *CI = dyn_cast<ConstantInt>(MI->getLength())) {
+            if (CI->getZExtValue() * 8 == AllocaSizeInBits.getFixedSize()) {
+              // Memcpy, memmove, and memset instructions that fill every byte
+              // of the alloca are definitions.
+              recordUseOrDef(MI, true);
+              return false;
+            }
+          }
+        }
+      }
+    } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+      // Stores that overwrite all bytes of the alloca are definitions.
+      if (U->getOperandNo() == 1 &&
+          DL.getTypeStoreSizeInBits(SI->getValueOperand()->getType()) ==
+              AllocaSizeInBits.getFixedSize()) {
+        recordUseOrDef(SI, true);
+        return false;
+      }
+    }
+
+    // Otherwise, this instruction is a use. Make a note of that fact and
+    // continue.
+    recordUseOrDef(I, false);
+    return false;
+  }
+};
+
+} // namespace
+
+// Performs liveness dataflow analysis for an alloca at the basic block level as
+// part of the stack-move optimization.
+//
+// This implements the "backwards variable-at-a-time" variant of liveness
+// analysis, propagating liveness information backwards from uses until it sees
+// a basic block with a definition or one in which the variable is already
+// live-out. As implemented, this is a linear-time algorithm, because it visits
+// each basic block at most once and the number of tracked variables is
+// constant (two: the source and destination of the memcpy).
+//
+// In order to avoid spending too much compile time, this operates on the level
+// of basic blocks instead of instructions, making it a conservative
+// analysis. See the comments in performStackMoveOptzn() for more details.
+//
+// Returns true if the analysis succeeded or false if it failed due to examining
+// too many basic blocks.
+static bool computeLiveness(BasicBlockLivenessMap &BBLiveness) {
+  // Start by initializing a worklist with all basic blocks that are live-in
+  // (i.e. they potentially need to propagate liveness to their predecessors).
+  SmallVector<BasicBlock *> Worklist;
+  for (auto &Pair : BBLiveness) {
+    if (Pair.second.isLiveIn())
+      Worklist.push_back(Pair.first);
+  }
+
+  // Iterate until we have no more blocks to process.
+  unsigned Count = 0;
+  while (!Worklist.empty()) {
+    BasicBlock *BB = Worklist.back();
+    Worklist.pop_back();
+
+    // Cap the number of basic blocks we examine in order to avoid blowing up
+    // compile time. The default threshold was empirically determined to be
+    // sufficient 90% of the time in the Rust compiler.
+    ++Count;
+    if (Count >= MemCpyOptStackMoveThreshold) {
+      LLVM_DEBUG(
+          dbgs()
+          << "Stack Move: Exceeded max basic block threshold, bailing\n");
+      return false;
+    }
+
+    // We know that the alloca must be live-in to this basic block, or else we
+    // wouldn't have added the block to the worklist in the first place.
+    assert(BBLiveness.lookup(BB).isLiveIn() &&
+           "Shouldn't have added a BB that wasn't live-in to the worklist!");
+
+    // Propagate liveness back to predecessors.
+    for (BasicBlock *Pred : predecessors(BB)) {
+      BasicBlockLiveness PredLiveness = BBLiveness.lookup(Pred);
+
+      // Skip predecessors in which the variable is already known to be
+      // live-out.
+      if (!PredLiveness.isLiveOut()) {
+        PredLiveness.setLiveOut(true);
+
+        // Don't enqueue predecessors if they contain direct defs or uses of the
+        // variable. If a predecessor contains a use of the variable that
+        // dominates all the other uses or defs of the variable within that
+        // block, then we already added that predecessor to the worklist at the
+        // beginning of this procedure, so we don't need to add it again. If, on
+        // the other hand, the predecessor contains a definition of the variable
+        // that dominates all the other uses or defs of the variable within the
+        // block, then the predecessor won't propagate any liveness to *its*
+        // predecessors, so we don't need to enqueue it either.
+        if (!PredLiveness.hasDefUseInst()) {
+          // We know that this predecessor is a basic block that contains
+          // neither defs nor uses of the variable and in which the variable is
+          // live-out. So the variable must be live-in to this predecessor too.
+          PredLiveness.setLiveIn(true);
+          Worklist.push_back(Pred);
+        }
+
+        BBLiveness[Pred] = PredLiveness;
+      }
+    }
+  }
+
+  return true;
+}
+
+// Returns true if the alloca is at the start of the entry block, modulo a few
+// instructions like GEPs and debug info. We only perform the stack-move
+// optimization for such allocas, which simplifies the logic.
+static bool allocaIsAtStartOfEntryBlock(AllocaInst *AI) {
+  BasicBlock *BB = AI->getParent();
+  if (!BB->isEntryBlock()) {
+    LLVM_DEBUG(dbgs() << "Stack Move: Alloca isn't in entry block\n");
+    return false;
+  }
+
+  for (Instruction &I : *BB) {
+    if (&I == AI)
+      return true;
+    if (isa<AllocaInst>(I) || isa<GetElementPtrInst>(I) ||
+        isa<DbgInfoIntrinsic>(I) || I.isLifetimeStartOrEnd()) {
+      continue;
+    }
+    LLVM_DEBUG(
+        dbgs()
+        << "Stack Move: Alloca isn't at start of entry block\n Instruction:"
+        << I << "\n");
+    return false;
+  }
+
+  llvm_unreachable("Alloca wasn't found in its parent basic block");
+}
+
+// Attempts to optimize the pattern whereby memory is copied from an alloca to
+// another alloca, where the two allocas aren't live simultaneously except
+// during the transfer. If successful, the two allocas can be merged into one
+// and the transfer can be deleted. This pattern is generated frequently in
+// Rust, due to the ubiquity of move operations in that language.
+//
+// We choose to limit this optimization to cases in which neither alloca was
+// captured, in order to avoid interprocedural analysis. The same
+// CaptureTracking framework that is needed to detect this condition also
+// turns out to be useful for gathering definitions and uses. So our general
+// approach is to run CaptureTracking to find captures and simultaneously gather
+// up uses and defs, followed by the standard liveness dataflow analysis to
+// ensure that the source and destination aren't simultaneously live anywhere.
+//
+// To avoid blowing up compile time, we perform the liveness analysis
+// conservatively on the basic block level rather than on the instruction level,
+// with the exception of the basic block containing the memcpy itself. This
+// means that any basic block that contains a use of both the source and
+// destination causes us to conservatively bail out, even if the source and
+// destination aren't actually simultaneously live. Empirically, this happens
+// less than 2% of the time in typical Rust code, making the
+// precision/compile-time tradeoff well worth it.
+//
+// Once we determine that the optimization is safe to perform, we replace all
+// uses of the destination alloca with the source alloca. We also "shrink wrap"
+// the lifetime markers of the single merged alloca to the nearest dominating
+// and postdominating basic block. Note that the "shrink wrapping" procedure is
+// a safe transformation only because we restrict the scope of this optimization
+// to allocas that aren't captured.
+bool MemCpyOptPass::performStackMoveOptzn(Instruction *Load, Instruction *Store,
+                                          AllocaInst *DestAlloca,
+                                          AllocaInst *SrcAlloca,
+                                          uint64_t Size) {
+  // If the optimization is disabled, forget it.
+  if (MemCpyOptStackMoveThreshold == 0)
+    return false;
+
+  LLVM_DEBUG(dbgs() << "Stack Move: Attempting to optimize:\n"
+                    << *Store << "\n");
+
+  // Make sure the two allocas are in the same address space.
+  if (SrcAlloca->getAddressSpace() != DestAlloca->getAddressSpace()) {
+    LLVM_DEBUG(dbgs() << "Stack Move: Address space mismatch\n");
+    return false;
+  }
+
+  // Calculate the static size of the allocas to be merged, bailing out if we
+  // can't.
+  const DataLayout &DL = DestAlloca->getModule()->getDataLayout();
+  std::optional<TypeSize> SrcSize = SrcAlloca->getAllocationSizeInBits(DL);
+  if (!SrcSize || SrcSize->isScalable() ||
+      Size * 8 != SrcSize->getFixedSize()) {
+    LLVM_DEBUG(dbgs() << "Stack Move: Source alloca size mismatch\n");
+    return false;
+  }
+  std::optional<TypeSize> DestSize = DestAlloca->getAllocationSizeInBits(DL);
+  if (!DestSize || DestSize->isScalable() ||
+      Size * 8 != DestSize->getFixedSize()) {
+    LLVM_DEBUG(dbgs() << "Stack Move: Destination alloca size mismatch\n");
+    return false;
+  }
+
+  // Make sure the allocas are at the start of the entry block. This lets us
+  // avoid having to do annoying checks to ensure the allocas dominate their
+  // uses, as well as problems related to llvm.stacksave and llvm.stackrestore
+  // intrinsics.
+  if (!allocaIsAtStartOfEntryBlock(DestAlloca) ||
+      !allocaIsAtStartOfEntryBlock(SrcAlloca)) {
+    return false;
+  }
+
+  // Gather up all uses of the destination. Make sure that it wasn't captured
+  // anywhere.
+  StackMoveTracker DestTracker(Store, DestAlloca, *DT, *PDT);
+  PointerMayBeCaptured(DestAlloca, &DestTracker);
+  if (DestTracker.Abort) {
+    LLVM_DEBUG({
+      dbgs() << "Stack Move: Destination was captured:";
+      if (DestTracker.AbortingUser != nullptr)
+        dbgs() << "\n" << *DestTracker.AbortingUser;
+      dbgs() << "\n";
+    });
+    return false;
+  }
+
+  // Likewise, collect all uses of the source, again making sure that it wasn't
+  // captured anywhere.
+  StackMoveTracker SrcTracker(Store, SrcAlloca, *DT, *PDT);
+  PointerMayBeCaptured(SrcAlloca, &SrcTracker);
+  if (SrcTracker.Abort) {
+    LLVM_DEBUG({
+      dbgs() << "Stack Move: Source was captured:";
+      if (SrcTracker.AbortingUser != nullptr)
+        dbgs() << "\n" << *SrcTracker.AbortingUser;
+      dbgs() << "\n";
+    });
+    return false;
+  }
+
+  // Compute liveness on the basic block level.
+  BasicBlock *StoreBB = Store->getParent();
+  if (!computeLiveness(DestTracker.BBLiveness) ||
+      !computeLiveness(SrcTracker.BBLiveness)) {
+    return false;
+  }
+
+  // Check for liveness conflicts on the basic block level (with the exception
+  // of the basic block containing the memcpy). This is conservative compared to
+  // computing liveness on the instruction level. The precision loss is only 2%
+  // on the Rust compiler, however, making this compile-time tradeoff
+  // worthwhile.
+  for (auto DestPair : DestTracker.BBLiveness) {
+    BasicBlock *BB = DestPair.first;
+    if (BB != StoreBB && DestPair.second.isLiveAnywhereOrHasUses() &&
+        SrcTracker.BBLiveness.lookup(BB).isLiveAnywhereOrHasUses()) {
+      LLVM_DEBUG(dbgs() << "Stack Move: Detected liveness conflict, "
+                           "bailing:\n Basic Block: "
+                        << BB->getNameOrAsOperand() << "\n");
+      return false;
+    }
+  }
+
+  // Check liveness inside the single basic block containing the load and
+  // store.
+  bool DestLive = DestTracker.BBLiveness.lookup(StoreBB).isLiveOut();
+  bool SrcLive = SrcTracker.BBLiveness.lookup(StoreBB).isLiveOut();
+  for (auto &BI : reverse(*StoreBB)) {
+    if (DestLive && SrcLive && &BI != Load && &BI != Store) {
+      LLVM_DEBUG(
+          dbgs() << "Stack Move: Detected liveness conflict inside the basic "
+                    "block containing the memcpy, bailing:\n Instruction: "
+                 << BI << "\n");
+      return false;
+    }
+
+    auto DestDefUseIt = DestTracker.StoreBBDefUseMap.find(&BI);
+    auto SrcDefUseIt = SrcTracker.StoreBBDefUseMap.find(&BI);
+    if (DestDefUseIt != DestTracker.StoreBBDefUseMap.end())
+      DestLive = !DestDefUseIt->second;
+    if (SrcDefUseIt != SrcTracker.StoreBBDefUseMap.end())
+      SrcLive = !SrcDefUseIt->second;
+  }
+
+  // We can do the transformation. First, align the allocas appropriately.
+  SrcAlloca->setAlignment(
+      std::max(SrcAlloca->getAlign(), DestAlloca->getAlign()));
+
+  // Merge the two allocas.
+  DestAlloca->replaceAllUsesWith(SrcAlloca);
+
+  // Drop metadata on the source alloca.
+  SrcAlloca->dropUnknownNonDebugMetadata();
+
+  // Now "shrink wrap" the lifetimes. Begin by creating a new lifetime start
+  // marker at the start of the nearest common dominator of all defs and uses of
+  // the merged alloca.
+  //
+  // We could be more precise here and query AA to find the latest point in the
+  // basic block at which to place the call to the intrinsic, but that doesn't
+  // seem worth it at the moment.
+  assert(DestTracker.Dom != nullptr && SrcTracker.Dom != nullptr &&
+         "There must be a common dominator for all defs and uses of the source "
+         "and destination");
+  Type *IntPtrTy =
+      Type::getIntNTy(SrcAlloca->getContext(), DL.getPointerSizeInBits());
+  ConstantInt *CI = cast<ConstantInt>(ConstantInt::get(IntPtrTy, Size));
+  BasicBlock *Dom =
+      DT->findNearestCommonDominator(DestTracker.Dom, SrcTracker.Dom);
+  BasicBlock::iterator InsertionPt = Dom->getFirstNonPHIOrDbgOrAlloca();
+  if (Dom == SrcAlloca->getParent() && InsertionPt != Dom->end() &&
+      InsertionPt->comesBefore(SrcAlloca)) {
+    // Make sure that the alloca dominates the lifetime start intrinsic.
+    // Usually, the call to getFirstNonPHIOrDbgOrAlloca() above ensures that,
+    // but if the allocas aren't all at the start of the basic block we might
+    // have to fix things up.
+    InsertionPt = ++BasicBlock::iterator(SrcAlloca);
+  }
+  IRBuilder<>(Dom, InsertionPt).CreateLifetimeStart(SrcAlloca, CI);
+
+  // Next, create a new lifetime end marker at the end of the nearest common
+  // postdominator of all defs and uses of the merged alloca, if there is one.
+ // If there's no such postdominator, just don't bother; we could create one at + // each exit block, but that'd be essentially semantically meaningless. + if (DestTracker.PostDom != nullptr && SrcTracker.PostDom != nullptr) { + if (BasicBlock *PostDom = PDT->findNearestCommonDominator( + DestTracker.PostDom, SrcTracker.PostDom)) { + // Edge case: It's possible that the terminating instruction of the + // postdominating basic block is itself an invoke instruction that uses + // the alloca. Placing the lifetime end intrinsic before that call would + // be incorrect. Detect this situation and choose the next postdominator + // instead. + MemoryLocation Loc = MemoryLocation::getBeforeOrAfter(SrcAlloca); + if (isModOrRefSet(AA->getModRefInfo(PostDom->getTerminator(), Loc))) { + auto PostDomNode = (*PDT)[PostDom]->getIDom(); + PostDom = PostDomNode != nullptr ? PostDomNode->getBlock() : nullptr; + } + + // Add the lifetime end intrinsic. + if (PostDom != nullptr) { + IRBuilder<>(PostDom, BasicBlock::iterator(PostDom->getTerminator())) + .CreateLifetimeEnd(SrcAlloca, CI); + } + } + } + + // Remove all other lifetime markers. + for (IntrinsicInst *II : DestTracker.LifetimeMarkers) + eraseInstruction(II); + for (IntrinsicInst *II : SrcTracker.LifetimeMarkers) + eraseInstruction(II); + + // As this transformation can cause memory accesses that didn't previously + // alias to begin to alias one another, we remove !noalias metadata from any + // uses of either alloca. This is conservative, but more precision doesn't + // seem worthwhile right now. + for (Instruction *I : DestTracker.NoAliasInstrs) + I->setMetadata(LLVMContext::MD_noalias, nullptr); + for (Instruction *I : SrcTracker.NoAliasInstrs) + I->setMetadata(LLVMContext::MD_noalias, nullptr); + + // We're done! We don't need to delete the memcpy because later passes will do + // it. + LLVM_DEBUG(dbgs() << "Stack Move: Performed stack-move optimization\n"); + ++NumStackMove; + return true; +} + /// Perform simplification of memcpy's. If we have memcpy A /// which copies X to Y, and memcpy B which copies Y to Z, then we can rewrite /// B to be a memcpy from X to Z (or potentially a memmove, depending on @@ -1490,13 +2093,14 @@ MemoryAccess *SrcClobber = MSSA->getWalker()->getClobberingMemoryAccess( AnyClobber, MemoryLocation::getForSource(M), BAA); - // There are four possible optimizations we can do for memcpy: + // There are five possible optimizations we can do for memcpy: // a) memcpy-memcpy xform which exposes redundance for DSE. // b) call-memcpy xform for return slot optimization. // c) memcpy from freshly alloca'd space or space that has just started // its lifetime copies undefined data, and we can therefore eliminate // the memcpy in favor of the data that was already at the destination. // d) memcpy from a just-memset'd source can be turned into memset. + // e) elimination of memcpy via stack-move optimization. 
  if (auto *MD = dyn_cast<MemoryDef>(SrcClobber)) {
    if (Instruction *MI = MD->getMemoryInst()) {
      if (auto *CopySize = dyn_cast<ConstantInt>(M->getLength())) {
@@ -1518,8 +2122,10 @@
          }
        }
      }
-      if (auto *MDep = dyn_cast<MemCpyInst>(MI))
-        return processMemCpyMemCpyDependence(M, MDep, BAA);
+      if (auto *MDep = dyn_cast<MemCpyInst>(MI)) {
+        if (processMemCpyMemCpyDependence(M, MDep, BAA))
+          return true;
+      }
      if (auto *MDep = dyn_cast<MemSetInst>(MI)) {
        if (performMemCpyToMemSetOptzn(M, MDep, BAA)) {
          LLVM_DEBUG(dbgs() << "Converted memcpy to memset\n");
@@ -1538,6 +2144,26 @@
    }
  }
+
+  // If the transfer is from a stack slot to a stack slot, then we may be able
+  // to perform the stack-move optimization. See the comments in
+  // performStackMoveOptzn() for more details.
+  AllocaInst *DestAlloca = dyn_cast<AllocaInst>(M->getDest());
+  if (DestAlloca == nullptr)
+    return false;
+  AllocaInst *SrcAlloca = dyn_cast<AllocaInst>(M->getSource());
+  if (SrcAlloca == nullptr)
+    return false;
+  ConstantInt *Len = dyn_cast<ConstantInt>(M->getLength());
+  if (Len == nullptr)
+    return false;
+  if (performStackMoveOptzn(M, M, DestAlloca, SrcAlloca, Len->getZExtValue())) {
+    // Avoid invalidating the iterator.
+    BBI = M->getNextNonDebugInstruction()->getIterator();
+    eraseInstruction(M);
+    ++NumMemCpyInstr;
+    return true;
+  }
+
  return false;
}
@@ -1693,9 +2319,10 @@
  auto *AA = &AM.getResult<AAManager>(F);
  auto *AC = &AM.getResult<AssumptionAnalysis>(F);
  auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
+  auto *PDT = &AM.getResult<PostDominatorTreeAnalysis>(F);
  auto *MSSA = &AM.getResult<MemorySSAAnalysis>(F);
-  bool MadeChange = runImpl(F, &TLI, AA, AC, DT, &MSSA->getMSSA());
+  bool MadeChange = runImpl(F, &TLI, AA, AC, DT, PDT, &MSSA->getMSSA());
  if (!MadeChange)
    return PreservedAnalyses::all();
@@ -1707,12 +2334,14 @@
bool MemCpyOptPass::runImpl(Function &F, TargetLibraryInfo *TLI_,
                            AliasAnalysis *AA_, AssumptionCache *AC_,
-                            DominatorTree *DT_, MemorySSA *MSSA_) {
+                            DominatorTree *DT_, PostDominatorTree *PDT_,
+                            MemorySSA *MSSA_) {
  bool MadeChange = false;
  TLI = TLI_;
  AA = AA_;
  AC = AC_;
  DT = DT_;
+  PDT = PDT_;
  MSSA = MSSA_;
  MemorySSAUpdater MSSAU_(MSSA_);
  MSSAU = &MSSAU_;
@@ -1738,7 +2367,8 @@
  auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
  auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
  auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  auto *PDT = &getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
  auto *MSSA = &getAnalysis<MemorySSAWrapperPass>().getMSSA();
-  return Impl.runImpl(F, TLI, AA, AC, DT, MSSA);
+  return Impl.runImpl(F, TLI, AA, AC, DT, PDT, MSSA);
}
diff --git a/llvm/test/Analysis/ScopedNoAliasAA/alias-scope-merging.ll b/llvm/test/Analysis/ScopedNoAliasAA/alias-scope-merging.ll
--- a/llvm/test/Analysis/ScopedNoAliasAA/alias-scope-merging.ll
+++ b/llvm/test/Analysis/ScopedNoAliasAA/alias-scope-merging.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -S -passes=memcpyopt | FileCheck --match-full-lines %s
+; RUN: opt < %s -S -passes=memcpyopt -memcpyopt-stack-move-threshold=0 | FileCheck --match-full-lines %s
 ; Alias scopes are merged by taking the intersection of domains, then the union of the scopes within those domains
 define i8 @test(i8 %input) {
diff --git a/llvm/test/Other/new-pm-defaults.ll b/llvm/test/Other/new-pm-defaults.ll
--- a/llvm/test/Other/new-pm-defaults.ll
+++ b/llvm/test/Other/new-pm-defaults.ll
@@ -190,6 +190,7 @@
 ; CHECK-O23SZ-NEXT: Running pass: GVNPass
 ; CHECK-O23SZ-NEXT: Running analysis: MemoryDependenceAnalysis
 ; CHECK-O1-NEXT: Running pass: MemCpyOptPass
+; CHECK-O1-NEXT: Running analysis: PostDominatorTreeAnalysis
 ; CHECK-O-NEXT: Running pass: SCCPPass
 ; CHECK-O-NEXT: Running pass: BDCEPass
 ; CHECK-O-NEXT: Running analysis: DemandedBitsAnalysis
@@ -201,7 +202,7 @@
 ; CHECK-O23SZ-NEXT: Invalidating analysis: LazyValueAnalysis
 ; CHECK-O1-NEXT: Running pass: CoroElidePass
 ; CHECK-O-NEXT: Running pass: ADCEPass
-; CHECK-O-NEXT: Running analysis: PostDominatorTreeAnalysis
+; CHECK-O23SZ-NEXT: Running analysis: PostDominatorTreeAnalysis
 ; CHECK-O23SZ-NEXT: Running pass: MemCpyOptPass
 ; CHECK-O23SZ-NEXT: Running pass: DSEPass
 ; CHECK-O23SZ-NEXT: Running pass: LoopSimplifyPass
diff --git a/llvm/test/Other/new-pm-lto-defaults.ll b/llvm/test/Other/new-pm-lto-defaults.ll
--- a/llvm/test/Other/new-pm-lto-defaults.ll
+++ b/llvm/test/Other/new-pm-lto-defaults.ll
@@ -110,8 +110,8 @@
 ; CHECK-O23SZ-NEXT: Running pass: GVNPass on foo
 ; CHECK-O23SZ-NEXT: Running analysis: MemoryDependenceAnalysis on foo
 ; CHECK-O23SZ-NEXT: Running pass: MemCpyOptPass on foo
-; CHECK-O23SZ-NEXT: Running pass: DSEPass on foo
 ; CHECK-O23SZ-NEXT: Running analysis: PostDominatorTreeAnalysis on foo
+; CHECK-O23SZ-NEXT: Running pass: DSEPass on foo
 ; CHECK-O23SZ-NEXT: Running pass: MergedLoadStoreMotionPass on foo
 ; CHECK-O23SZ-NEXT: Running pass: LoopSimplifyPass on foo
 ; CHECK-O23SZ-NEXT: Running pass: LCSSAPass on foo
diff --git a/llvm/test/Other/new-pm-thinlto-defaults.ll b/llvm/test/Other/new-pm-thinlto-defaults.ll
--- a/llvm/test/Other/new-pm-thinlto-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-defaults.ll
@@ -163,6 +163,7 @@
 ; CHECK-O23SZ-NEXT: Running pass: GVNPass
 ; CHECK-O23SZ-NEXT: Running analysis: MemoryDependenceAnalysis
 ; CHECK-O1-NEXT: Running pass: MemCpyOptPass
+; CHECK-O1-NEXT: Running analysis: PostDominatorTreeAnalysis
 ; CHECK-O-NEXT: Running pass: SCCPPass
 ; CHECK-O-NEXT: Running pass: BDCEPass
 ; CHECK-O-NEXT: Running analysis: DemandedBitsAnalysis
@@ -173,7 +174,7 @@
 ; CHECK-O23SZ-NEXT: Invalidating analysis: LazyValueAnalysis
 ; CHECK-O1-NEXT: Running pass: CoroElidePass
 ; CHECK-O-NEXT: Running pass: ADCEPass
-; CHECK-O-NEXT: Running analysis: PostDominatorTreeAnalysis
+; CHECK-O23SZ-NEXT: Running analysis: PostDominatorTreeAnalysis
 ; CHECK-O23SZ-NEXT: Running pass: MemCpyOptPass
 ; CHECK-O23SZ-NEXT: Running pass: DSEPass
 ; CHECK-O23SZ-NEXT: Running pass: LoopSimplifyPass
diff --git a/llvm/test/Transforms/MemCpyOpt/callslot.ll b/llvm/test/Transforms/MemCpyOpt/callslot.ll
--- a/llvm/test/Transforms/MemCpyOpt/callslot.ll
+++ b/llvm/test/Transforms/MemCpyOpt/callslot.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -passes=memcpyopt < %s -verify-memoryssa | FileCheck %s
+; RUN: opt -S -passes=memcpyopt -memcpyopt-stack-move-threshold=0 < %s -verify-memoryssa | FileCheck %s
 define i8 @read_dest_between_call_and_memcpy() {
 ; CHECK-LABEL: @read_dest_between_call_and_memcpy(
diff --git a/llvm/test/Transforms/MemCpyOpt/callslot_badaa.ll b/llvm/test/Transforms/MemCpyOpt/callslot_badaa.ll
--- a/llvm/test/Transforms/MemCpyOpt/callslot_badaa.ll
+++ b/llvm/test/Transforms/MemCpyOpt/callslot_badaa.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -S -passes=memcpyopt | FileCheck --match-full-lines %s
+; RUN: opt < %s -S -passes=memcpyopt -memcpyopt-stack-move-threshold=0 | FileCheck --match-full-lines %s
 ; Make sure callslot optimization merges alias.scope metadata correctly when it merges instructions.
 ; Merging here naively generates:
diff --git a/llvm/test/Transforms/MemCpyOpt/stack-move.ll b/llvm/test/Transforms/MemCpyOpt/stack-move.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/MemCpyOpt/stack-move.ll
@@ -0,0 +1,894 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; Tests that the stack-move optimization functions properly.
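+;
+; As a rough illustration of the transformation exercised below (a hand-written
+; sketch, not autogenerated checker output; the values simply mirror the tests
+; that follow), the pass looks for the pattern
+;
+;   %src = alloca %struct.Foo, align 4
+;   %dest = alloca %struct.Foo, align 4
+;   ; ... initialize and use %src ...
+;   call void @llvm.memcpy.p0.p0.i64(ptr align 4 %dest, ptr align 4 %src, i64 12, i1 false)
+;   ; ... only %dest is used from here on ...
+;
+; and, when the two allocas are never live at the same time (apart from the copy
+; itself) and neither pointer is captured, it rewrites every use of %dest to use
+; %src, deletes the copy, and shrink-wraps the lifetime markers around the
+; remaining uses; @basic_memcpy below shows the resulting checks.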
+; +; RUN: opt -passes=memcpyopt -memcpyopt-stack-move-threshold=8 -S < %s | FileCheck %s + +%struct.Foo = type { i32, i32, i32 } + +@constant = private unnamed_addr constant %struct.Foo { i32 1, i32 2, i32 3 }, align 4 + +; Optimization successes follow: + +; Tests that the optimization succeeds with a basic call to memcpy. +define void @basic_memcpy() { +; CHECK-LABEL: @basic_memcpy( +; CHECK-NEXT: [[SRC:%.*]] = alloca [[STRUCT_FOO:%.*]], align 4 +; CHECK-NEXT: [[DEST:%.*]] = alloca [[STRUCT_FOO]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr [[SRC]]) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[SRC]], ptr align 4 @constant, i64 12, i1 false) +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]) +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 12, ptr [[SRC]]) +; CHECK-NEXT: ret void +; + %src = alloca %struct.Foo, align 4 + %dest = alloca %struct.Foo, align 4 + call void @llvm.lifetime.start.p0(i64 12, ptr nocapture %src) + call void @llvm.lifetime.start.p0(i64 12, ptr nocapture %dest) + call void @llvm.memcpy.p0.p0.i64(ptr align 4 %src, ptr align 4 @constant, i64 12, i1 false) + %1 = call i32 @use_nocapture(ptr noundef nocapture %src) + call void @llvm.memcpy.p0.p0.i64(ptr align 4 %dest, ptr align 4 %src, i64 12, i1 false) + call void @llvm.lifetime.end.p0(i64 12, ptr nocapture %src) + %2 = call i32 @use_nocapture(ptr noundef nocapture %dest) + call void @llvm.lifetime.end.p0(i64 12, ptr nocapture %dest) + ret void +} + +; Tests that the optimization succeeds with a basic call to memmove. +define void @basic_memmove() { +; CHECK-LABEL: @basic_memmove( +; CHECK-NEXT: [[SRC:%.*]] = alloca [[STRUCT_FOO:%.*]], align 4 +; CHECK-NEXT: [[DEST:%.*]] = alloca [[STRUCT_FOO]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr [[SRC]]) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[SRC]], ptr align 4 @constant, i64 12, i1 false) +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]) +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 12, ptr [[SRC]]) +; CHECK-NEXT: ret void +; + %src = alloca %struct.Foo, align 4 + %dest = alloca %struct.Foo, align 4 + call void @llvm.lifetime.start.p0(i64 12, ptr nocapture %src) + call void @llvm.lifetime.start.p0(i64 12, ptr nocapture %dest) + call void @llvm.memmove.p0.p0.i64(ptr align 4 %src, ptr align 4 @constant, i64 12, i1 false) + %1 = call i32 @use_nocapture(ptr noundef nocapture %src) + call void @llvm.memmove.p0.p0.i64(ptr align 4 %dest, ptr align 4 %src, i64 12, i1 false) + call void @llvm.lifetime.end.p0(i64 12, ptr nocapture %src) + %2 = call i32 @use_nocapture(ptr noundef nocapture %dest) + call void @llvm.lifetime.end.p0(i64 12, ptr nocapture %dest) + ret void +} + +; Tests that the optimization succeeds with a load/store pair. 
+define void @load_store() { +; CHECK-LABEL: @load_store( +; CHECK-NEXT: [[SRC:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[DEST:%.*]] = alloca i32, align 4 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[SRC]]) +; CHECK-NEXT: store i32 42, ptr [[SRC]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]) +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr [[SRC]]) +; CHECK-NEXT: ret void +; + %src = alloca i32, align 4 + %dest = alloca i32, align 4 + call void @llvm.lifetime.start.p0(i64 4, ptr nocapture %src) + call void @llvm.lifetime.start.p0(i64 4, ptr nocapture %dest) + store i32 42, ptr %src + %1 = call i32 @use_nocapture(ptr noundef nocapture %src) + %2 = load i32, ptr %src + store i32 %2, ptr %dest + call void @llvm.lifetime.end.p0(i64 4, ptr nocapture %src) + %3 = call i32 @use_nocapture(ptr noundef nocapture %dest) + call void @llvm.lifetime.end.p0(i64 4, ptr nocapture %dest) + ret void +} + +; Tests that merging two allocas with different alignments results in an +; alloca with the broader alignment. +define void @align_up() { +; CHECK-LABEL: @align_up( +; CHECK-NEXT: [[SRC:%.*]] = alloca [[STRUCT_FOO:%.*]], align 8 +; CHECK-NEXT: [[DEST:%.*]] = alloca [[STRUCT_FOO]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr [[SRC]]) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[SRC]], ptr align 4 @constant, i64 12, i1 false) +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]) +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 12, ptr [[SRC]]) +; CHECK-NEXT: ret void +; + %src = alloca %struct.Foo, align 8 + %dest = alloca %struct.Foo, align 4 + call void @llvm.lifetime.start.p0(i64 12, ptr nocapture %src) + call void @llvm.lifetime.start.p0(i64 12, ptr nocapture %dest) + call void @llvm.memcpy.p0.p0.i64(ptr align 4 %src, ptr align 4 @constant, i64 12, i1 false) + %1 = call i32 @use_nocapture(ptr noundef nocapture %src) + call void @llvm.memcpy.p0.p0.i64(ptr align 4 %dest, ptr align 4 %src, i64 12, i1 false) + call void @llvm.lifetime.end.p0(i64 12, ptr nocapture %src) + %2 = call i32 @use_nocapture(ptr noundef nocapture %dest) + call void @llvm.lifetime.end.p0(i64 12, ptr nocapture %dest) + ret void +} + +; Tests that we correctly remove extra lifetime intrinsics when performing the +; optimization. 
+define void @remove_extra_lifetime_intrinsics() { +; CHECK-LABEL: @remove_extra_lifetime_intrinsics( +; CHECK-NEXT: [[SRC:%.*]] = alloca [[STRUCT_FOO:%.*]], align 4 +; CHECK-NEXT: [[DEST:%.*]] = alloca [[STRUCT_FOO]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr [[SRC]]) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[SRC]], ptr align 4 @constant, i64 12, i1 false) +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]) +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]) +; CHECK-NEXT: [[TMP3:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 12, ptr [[SRC]]) +; CHECK-NEXT: ret void +; + %src = alloca %struct.Foo, align 4 + %dest = alloca %struct.Foo, align 4 + call void @llvm.lifetime.start.p0(i64 12, ptr nocapture %src) + call void @llvm.lifetime.start.p0(i64 12, ptr nocapture %dest) + call void @llvm.memcpy.p0.p0.i64(ptr align 4 %src, ptr align 4 @constant, i64 12, i1 false) + %1 = call i32 @use_nocapture(ptr noundef nocapture %src) + call void @llvm.memcpy.p0.p0.i64(ptr align 4 %dest, ptr align 4 %src, i64 12, i1 false) + call void @llvm.lifetime.end.p0(i64 12, ptr nocapture %src) + %2 = call i32 @use_nocapture(ptr noundef nocapture %dest) + call void @llvm.lifetime.end.p0(i64 12, ptr nocapture %src) + call void @llvm.lifetime.start.p0(i64 12, ptr nocapture %src) + %3 = call i32 @use_nocapture(ptr noundef nocapture %dest) + call void @llvm.lifetime.end.p0(i64 12, ptr nocapture %src) + call void @llvm.lifetime.start.p0(i64 12, ptr nocapture %dest) + call void @llvm.lifetime.end.p0(i64 12, ptr nocapture %dest) + ret void +} + +; Tests that we remove scoped noalias metadata from a call. +define void @remove_scoped_noalias() { +; CHECK-LABEL: @remove_scoped_noalias( +; CHECK-NEXT: [[SRC:%.*]] = alloca [[STRUCT_FOO:%.*]], align 4 +; CHECK-NEXT: [[DEST:%.*]] = alloca [[STRUCT_FOO]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr [[SRC]]) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[SRC]], ptr align 4 @constant, i64 12, i1 false) +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]), !alias.scope !0 +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 12, ptr [[SRC]]) +; CHECK-NEXT: ret void +; + %src = alloca %struct.Foo, align 4 + %dest = alloca %struct.Foo, align 4 + call void @llvm.lifetime.start.p0(i64 12, ptr nocapture %src) + call void @llvm.lifetime.start.p0(i64 12, ptr nocapture %dest) + call void @llvm.memcpy.p0.p0.i64(ptr align 4 %src, ptr align 4 @constant, i64 12, i1 false) + %1 = call i32 @use_nocapture(ptr noundef nocapture %src), !alias.scope !2 + call void @llvm.memcpy.p0.p0.i64(ptr align 4 %dest, ptr align 4 %src, i64 12, i1 false) + call void @llvm.lifetime.end.p0(i64 12, ptr nocapture %src) + %2 = call i32 @use_nocapture(ptr noundef nocapture %dest), !noalias !2 + call void @llvm.lifetime.end.p0(i64 12, ptr nocapture %dest) + ret void +} + +; Tests that we remove metadata on the merged alloca. 
+define void @remove_alloca_metadata() { +; CHECK-LABEL: @remove_alloca_metadata( +; CHECK-NEXT: [[SRC:%.*]] = alloca [[STRUCT_FOO:%.*]], align 4 +; CHECK-NEXT: [[DEST:%.*]] = alloca [[STRUCT_FOO]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr [[SRC]]) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[SRC]], ptr align 4 @constant, i64 12, i1 false) +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]), !alias.scope !0 +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 12, ptr [[SRC]]) +; CHECK-NEXT: ret void +; + %src = alloca %struct.Foo, align 4, !annotation !3 + %dest = alloca %struct.Foo, align 4 + call void @llvm.lifetime.start.p0(i64 12, ptr nocapture %src) + call void @llvm.lifetime.start.p0(i64 12, ptr nocapture %dest) + call void @llvm.memcpy.p0.p0.i64(ptr align 4 %src, ptr align 4 @constant, i64 12, i1 false) + %1 = call i32 @use_nocapture(ptr noundef nocapture %src), !alias.scope !2 + call void @llvm.memcpy.p0.p0.i64(ptr align 4 %dest, ptr align 4 %src, i64 12, i1 false) + call void @llvm.lifetime.end.p0(i64 12, ptr nocapture %src) + %2 = call i32 @use_nocapture(ptr noundef nocapture %dest), !noalias !2 + call void @llvm.lifetime.end.p0(i64 12, ptr nocapture %dest) + ret void +} + +; Tests that we correctly "shrinkwrap" lifetime intrinsics to the nearest +; common dominator and postdominator when performing the optimization. +define void @shrinkwrap_lifetimes() { +; CHECK-LABEL: @shrinkwrap_lifetimes( +; CHECK-NEXT: [[SRC:%.*]] = alloca [[STRUCT_FOO:%.*]], align 4 +; CHECK-NEXT: [[DEST:%.*]] = alloca [[STRUCT_FOO]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr [[SRC]]) +; CHECK-NEXT: [[TMP1:%.*]] = call i1 @cond() +; CHECK-NEXT: br i1 [[TMP1]], label [[BB0:%.*]], label [[BB1:%.*]] +; CHECK: bb0: +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[SRC]], ptr align 4 @constant, i64 12, i1 false) +; CHECK-NEXT: br label [[BB2:%.*]] +; CHECK: bb1: +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[SRC]], ptr align 4 @constant, i64 12, i1 false) +; CHECK-NEXT: br label [[BB2]] +; CHECK: bb2: +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]) +; CHECK-NEXT: [[TMP3:%.*]] = call i1 @cond() +; CHECK-NEXT: br i1 [[TMP3]], label [[BB3:%.*]], label [[BB4:%.*]] +; CHECK: bb3: +; CHECK-NEXT: [[TMP4:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]) +; CHECK-NEXT: br label [[BB5:%.*]] +; CHECK: bb4: +; CHECK-NEXT: [[TMP5:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]) +; CHECK-NEXT: br label [[BB5]] +; CHECK: bb5: +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 12, ptr [[SRC]]) +; CHECK-NEXT: ret void +; + %src = alloca %struct.Foo, align 4 + %dest = alloca %struct.Foo, align 4 + %1 = call i1 @cond() + br i1 %1, label %bb0, label %bb1 + +bb0: + call void @llvm.lifetime.start.p0(i64 12, ptr nocapture %src) + call void @llvm.memcpy.p0.p0.i64(ptr align 4 %src, ptr align 4 @constant, i64 12, i1 false) + br label %bb2 + +bb1: + call void @llvm.lifetime.start.p0(i64 12, ptr nocapture %src) + call void @llvm.memcpy.p0.p0.i64(ptr align 4 %src, ptr align 4 @constant, i64 12, i1 false) + br label %bb2 + +bb2: + %2 = call i32 @use_nocapture(ptr noundef nocapture %src) + call void @llvm.lifetime.start.p0(i64 12, ptr nocapture %dest) + call void @llvm.memcpy.p0.p0.i64(ptr align 4 %dest, ptr align 4 %src, i64 12, i1 false) + %3 = call i1 @cond() + br 
i1 %3, label %bb3, label %bb4 + +bb3: + call void @llvm.lifetime.end.p0(i64 12, ptr nocapture %src) + %4 = call i32 @use_nocapture(ptr noundef nocapture %dest) + call void @llvm.lifetime.end.p0(i64 12, ptr nocapture %dest) + br label %bb5 + +bb4: + call void @llvm.lifetime.end.p0(i64 12, ptr nocapture %src) + %5 = call i32 @use_nocapture(ptr noundef nocapture %dest) + call void @llvm.lifetime.end.p0(i64 12, ptr nocapture %dest) + br label %bb5 + +bb5: + ret void +} + +; Tests that GEP doesn't count as a use for the purposes of liveness analysis. +define void @gep_isnt_a_use() { +; CHECK-LABEL: @gep_isnt_a_use( +; CHECK-NEXT: [[SRC:%.*]] = alloca [[STRUCT_FOO:%.*]], align 4 +; CHECK-NEXT: [[DEST:%.*]] = alloca [[STRUCT_FOO]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr [[SRC]]) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[SRC]], ptr align 4 @constant, i64 12, i1 false) +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]) +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr [[STRUCT_FOO]], ptr [[SRC]], i32 0, i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 12, ptr [[SRC]]) +; CHECK-NEXT: ret void +; + %src = alloca %struct.Foo, align 4 + %dest = alloca %struct.Foo, align 4 + call void @llvm.lifetime.start.p0(i64 12, ptr nocapture %src) + call void @llvm.lifetime.start.p0(i64 12, ptr nocapture %dest) + call void @llvm.memcpy.p0.p0.i64(ptr align 4 %src, ptr align 4 @constant, i64 12, i1 false) + %1 = call i32 @use_nocapture(ptr noundef nocapture %src) + call void @llvm.memcpy.p0.p0.i64(ptr align 4 %dest, ptr align 4 %src, i64 12, i1 false) + %2 = getelementptr %struct.Foo, ptr %src, i32 0, i32 0 + call void @llvm.lifetime.end.p0(i64 12, ptr nocapture %src) + %3 = call i32 @use_nocapture(ptr noundef nocapture %dest) + call void @llvm.lifetime.end.p0(i64 12, ptr nocapture %dest) + ret void +} + +; Tests that a memcpy that completely overwrites a stack value is a definition +; for the purposes of liveness analysis. 
+define void @memcpy_is_def() { +; CHECK-LABEL: @memcpy_is_def( +; CHECK-NEXT: [[SRC:%.*]] = alloca [[STRUCT_FOO:%.*]], align 4 +; CHECK-NEXT: [[DEST:%.*]] = alloca [[STRUCT_FOO]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr [[SRC]]) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[SRC]], ptr align 4 @constant, i64 12, i1 false) +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]) +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[SRC]], ptr align 4 @constant, i64 12, i1 false) +; CHECK-NEXT: [[TMP3:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 12, ptr [[SRC]]) +; CHECK-NEXT: ret void +; + %src = alloca %struct.Foo, align 4 + %dest = alloca %struct.Foo, align 4 + call void @llvm.lifetime.start.p0(i64 12, ptr nocapture %src) + call void @llvm.lifetime.start.p0(i64 12, ptr nocapture %dest) + call void @llvm.memcpy.p0.p0.i64(ptr align 4 %src, ptr align 4 @constant, i64 12, i1 false) + %1 = call i32 @use_nocapture(ptr noundef nocapture %src) + call void @llvm.memcpy.p0.p0.i64(ptr align 4 %dest, ptr align 4 %src, i64 12, i1 false) + %2 = call i32 @use_nocapture(ptr noundef nocapture %dest) + call void @llvm.memcpy.p0.p0.i64(ptr align 4 %src, ptr align 4 @constant, i64 12, i1 false) + %3 = call i32 @use_nocapture(ptr noundef nocapture %src) + call void @llvm.lifetime.end.p0(i64 12, ptr nocapture %src) + call void @llvm.lifetime.end.p0(i64 12, ptr nocapture %dest) + ret void +} + +; Tests that a memset that completely overwrites a stack value is a definition +; for the purposes of liveness analysis. +define void @memset_is_def() { +; CHECK-LABEL: @memset_is_def( +; CHECK-NEXT: [[SRC:%.*]] = alloca [[STRUCT_FOO:%.*]], align 4 +; CHECK-NEXT: [[DEST:%.*]] = alloca [[STRUCT_FOO]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr [[SRC]]) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[SRC]], ptr align 4 @constant, i64 12, i1 false) +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]) +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]) +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[SRC]], i8 42, i64 12, i1 false) +; CHECK-NEXT: [[TMP3:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 12, ptr [[SRC]]) +; CHECK-NEXT: ret void +; + %src = alloca %struct.Foo, align 4 + %dest = alloca %struct.Foo, align 4 + call void @llvm.lifetime.start.p0(i64 12, ptr nocapture %src) + call void @llvm.lifetime.start.p0(i64 12, ptr nocapture %dest) + call void @llvm.memcpy.p0.p0.i64(ptr align 4 %src, ptr align 4 @constant, i64 12, i1 false) + %1 = call i32 @use_nocapture(ptr noundef nocapture %src) + call void @llvm.memcpy.p0.p0.i64(ptr align 4 %dest, ptr align 4 %src, i64 12, i1 false) + %2 = call i32 @use_nocapture(ptr noundef nocapture %dest) + call void @llvm.memset.p0.i64(ptr align 4 %src, i8 42, i64 12, i1 false) + %3 = call i32 @use_nocapture(ptr noundef nocapture %src) + call void @llvm.lifetime.end.p0(i64 12, ptr nocapture %src) + call void @llvm.lifetime.end.p0(i64 12, ptr nocapture %dest) + ret void +} + +; Tests that a store that completely overwrites a stack value is a definition +; for the purposes of liveness analysis. 
+define void @store_is_def() { +; CHECK-LABEL: @store_is_def( +; CHECK-NEXT: [[SRC:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[DEST:%.*]] = alloca i32, align 4 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[SRC]]) +; CHECK-NEXT: store i32 42, ptr [[SRC]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]) +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]) +; CHECK-NEXT: store i32 64, ptr [[SRC]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr [[SRC]]) +; CHECK-NEXT: ret void +; + %src = alloca i32, align 4 + %dest = alloca i32, align 4 + call void @llvm.lifetime.start.p0(i64 4, ptr nocapture %src) + call void @llvm.lifetime.start.p0(i64 4, ptr nocapture %dest) + store i32 42, ptr %src + %1 = call i32 @use_nocapture(ptr noundef nocapture %src) + %2 = load i32, ptr %src + store i32 %2, ptr %dest + %3 = call i32 @use_nocapture(ptr noundef nocapture %dest) + store i32 64, ptr %src + %4 = call i32 @use_nocapture(ptr noundef nocapture %src) + call void @llvm.lifetime.end.p0(i64 4, ptr nocapture %src) + call void @llvm.lifetime.end.p0(i64 4, ptr nocapture %dest) + ret void +} + +; Optimization failures follow: + +; Tests that the optimization fails with a load/store pair that isn't +; block-local. +define void @global_load_store() { +; CHECK-LABEL: @global_load_store( +; CHECK-NEXT: [[SRC:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[DEST:%.*]] = alloca i32, align 4 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr nocapture [[SRC]]) +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr nocapture [[DEST]]) +; CHECK-NEXT: store i32 42, ptr [[SRC]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]) +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[SRC]], align 4 +; CHECK-NEXT: br label [[BB0:%.*]] +; CHECK: bb0: +; CHECK-NEXT: store i32 [[TMP2]], ptr [[DEST]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr nocapture [[SRC]]) +; CHECK-NEXT: [[TMP3:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[DEST]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr nocapture [[DEST]]) +; CHECK-NEXT: ret void +; + %src = alloca i32, align 4 + %dest = alloca i32, align 4 + call void @llvm.lifetime.start.p0(i64 4, ptr nocapture %src) + call void @llvm.lifetime.start.p0(i64 4, ptr nocapture %dest) + store i32 42, ptr %src + %1 = call i32 @use_nocapture(ptr noundef nocapture %src) + %2 = load i32, ptr %src + br label %bb0 + +bb0: + store i32 %2, ptr %dest + call void @llvm.lifetime.end.p0(i64 4, ptr nocapture %src) + %3 = call i32 @use_nocapture(ptr noundef nocapture %dest) + call void @llvm.lifetime.end.p0(i64 4, ptr nocapture %dest) + ret void +} + +; Tests that dynamically-sized allocas are never merged. 
+define void @dynamically_sized_alloca(i64 %i) { +; CHECK-LABEL: @dynamically_sized_alloca( +; CHECK-NEXT: [[SRC:%.*]] = alloca i8, i64 [[I:%.*]], align 4 +; CHECK-NEXT: [[DEST:%.*]] = alloca i8, i64 [[I]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 -1, ptr nocapture [[SRC]]) +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 -1, ptr nocapture [[DEST]]) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[SRC]], ptr align 4 @constant, i64 12, i1 false) +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[DEST]], ptr align 4 [[SRC]], i64 12, i1 false) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 -1, ptr nocapture [[SRC]]) +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[DEST]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 -1, ptr nocapture [[DEST]]) +; CHECK-NEXT: ret void +; + %src = alloca i8, i64 %i, align 4 + %dest = alloca i8, i64 %i, align 4 + call void @llvm.lifetime.start.p0(i64 -1, ptr nocapture %src) + call void @llvm.lifetime.start.p0(i64 -1, ptr nocapture %dest) + call void @llvm.memcpy.p0.p0.i64(ptr align 4 %src, ptr align 4 @constant, i64 12, i1 false) + %1 = call i32 @use_nocapture(ptr noundef nocapture %src) + call void @llvm.memcpy.p0.p0.i64(ptr align 4 %dest, ptr align 4 %src, i64 12, i1 false) + call void @llvm.lifetime.end.p0(i64 -1, ptr nocapture %src) + %2 = call i32 @use_nocapture(ptr noundef nocapture %dest) + call void @llvm.lifetime.end.p0(i64 -1, ptr nocapture %dest) + ret void +} + +; Tests that a memcpy with a dynamic size is never optimized. +define void @dynamically_sized_memcpy(i64 %size) { +; CHECK-LABEL: @dynamically_sized_memcpy( +; CHECK-NEXT: [[SRC:%.*]] = alloca [[STRUCT_FOO:%.*]], align 4 +; CHECK-NEXT: [[DEST:%.*]] = alloca [[STRUCT_FOO]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr nocapture [[SRC]]) +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr nocapture [[DEST]]) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[SRC]], ptr align 4 @constant, i64 12, i1 false) +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[DEST]], ptr align 4 [[SRC]], i64 [[SIZE:%.*]], i1 false) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 12, ptr nocapture [[SRC]]) +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[DEST]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 12, ptr nocapture [[DEST]]) +; CHECK-NEXT: ret void +; + %src = alloca %struct.Foo, align 4 + %dest = alloca %struct.Foo, align 4 + call void @llvm.lifetime.start.p0(i64 12, ptr nocapture %src) + call void @llvm.lifetime.start.p0(i64 12, ptr nocapture %dest) + call void @llvm.memcpy.p0.p0.i64(ptr align 4 %src, ptr align 4 @constant, i64 12, i1 false) + %1 = call i32 @use_nocapture(ptr noundef nocapture %src) + call void @llvm.memcpy.p0.p0.i64(ptr align 4 %dest, ptr align 4 %src, i64 %size, i1 false) + call void @llvm.lifetime.end.p0(i64 12, ptr nocapture %src) + %2 = call i32 @use_nocapture(ptr noundef nocapture %dest) + call void @llvm.lifetime.end.p0(i64 12, ptr nocapture %dest) + ret void +} + +; Tests that allocas with different sizes aren't merged together. 
+define void @mismatched_alloca_size() { +; CHECK-LABEL: @mismatched_alloca_size( +; CHECK-NEXT: [[SRC:%.*]] = alloca i8, i64 24, align 4 +; CHECK-NEXT: [[DEST:%.*]] = alloca i8, i64 12, align 4 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 24, ptr nocapture [[SRC]]) +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr nocapture [[DEST]]) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[SRC]], ptr align 4 @constant, i64 12, i1 false) +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[DEST]], ptr align 4 [[SRC]], i64 12, i1 false) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 24, ptr nocapture [[SRC]]) +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[DEST]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 12, ptr nocapture [[DEST]]) +; CHECK-NEXT: ret void +; + %src = alloca i8, i64 24, align 4 + %dest = alloca i8, i64 12, align 4 + call void @llvm.lifetime.start.p0(i64 24, ptr nocapture %src) + call void @llvm.lifetime.start.p0(i64 12, ptr nocapture %dest) + call void @llvm.memcpy.p0.p0.i64(ptr align 4 %src, ptr align 4 @constant, i64 12, i1 false) + %1 = call i32 @use_nocapture(ptr noundef nocapture %src) + call void @llvm.memcpy.p0.p0.i64(ptr align 4 %dest, ptr align 4 %src, i64 12, i1 false) + call void @llvm.lifetime.end.p0(i64 24, ptr nocapture %src) + %2 = call i32 @use_nocapture(ptr noundef nocapture %dest) + call void @llvm.lifetime.end.p0(i64 12, ptr nocapture %dest) + ret void +} + +; Tests that allocas with mismatched address spaces aren't combined. +define void @mismatched_alloca_addrspace() { +; CHECK-LABEL: @mismatched_alloca_addrspace( +; CHECK-NEXT: [[SRC:%.*]] = alloca i8, i64 24, align 4, addrspace(1) +; CHECK-NEXT: [[DEST:%.*]] = alloca i8, i64 12, align 4, addrspace(2) +; CHECK-NEXT: call void @llvm.lifetime.start.p1(i64 24, ptr addrspace(1) nocapture [[SRC]]) +; CHECK-NEXT: call void @llvm.lifetime.start.p2(i64 12, ptr addrspace(2) nocapture [[DEST]]) +; CHECK-NEXT: call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) align 4 [[SRC]], ptr align 4 @constant, i64 12, i1 false) +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @use_nocapture(ptr addrspace(1) nocapture noundef [[SRC]]) +; CHECK-NEXT: call void @llvm.memcpy.p2.p1.i64(ptr addrspace(2) align 4 [[DEST]], ptr addrspace(1) align 4 [[SRC]], i64 12, i1 false) +; CHECK-NEXT: call void @llvm.lifetime.end.p1(i64 24, ptr addrspace(1) nocapture [[SRC]]) +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @use_nocapture(ptr addrspace(2) nocapture noundef [[DEST]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p2(i64 12, ptr addrspace(2) nocapture [[DEST]]) +; CHECK-NEXT: ret void +; + %src = alloca i8, i64 24, align 4, addrspace(1) + %dest = alloca i8, i64 12, align 4, addrspace(2) + call void @llvm.lifetime.start.p1(i64 24, ptr addrspace(1) nocapture %src) + call void @llvm.lifetime.start.p2(i64 12, ptr addrspace(2) nocapture %dest) + call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) align 4 %src, ptr align 4 @constant, i64 12, i1 false) + %1 = call i32 @use_nocapture(ptr addrspace(1) noundef nocapture %src) + call void @llvm.memcpy.p2.p1.i64(ptr addrspace(2) align 4 %dest, ptr addrspace(1) align 4 %src, i64 12, i1 false) + call void @llvm.lifetime.end.p1(i64 24, ptr addrspace(1) nocapture %src) + %2 = call i32 @use_nocapture(ptr addrspace(2) noundef nocapture %dest) + call void @llvm.lifetime.end.p2(i64 12, ptr addrspace(2) nocapture %dest) + ret void +} + +; Tests that volatile memcpys 
aren't removed. +define void @volatile_memcpy() { +; CHECK-LABEL: @volatile_memcpy( +; CHECK-NEXT: [[SRC:%.*]] = alloca [[STRUCT_FOO:%.*]], align 4 +; CHECK-NEXT: [[DEST:%.*]] = alloca [[STRUCT_FOO]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr nocapture [[SRC]]) +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr nocapture [[DEST]]) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[SRC]], ptr align 4 @constant, i64 12, i1 true) +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[DEST]], ptr align 4 [[SRC]], i64 12, i1 true) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 12, ptr nocapture [[SRC]]) +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[DEST]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 12, ptr nocapture [[DEST]]) +; CHECK-NEXT: ret void +; + %src = alloca %struct.Foo, align 4 + %dest = alloca %struct.Foo, align 4 + call void @llvm.lifetime.start.p0(i64 12, ptr nocapture %src) + call void @llvm.lifetime.start.p0(i64 12, ptr nocapture %dest) + call void @llvm.memcpy.p0.p0.i64(ptr align 4 %src, ptr align 4 @constant, i64 12, i1 true) + %1 = call i32 @use_nocapture(ptr noundef nocapture %src) + call void @llvm.memcpy.p0.p0.i64(ptr align 4 %dest, ptr align 4 %src, i64 12, i1 true) + call void @llvm.lifetime.end.p0(i64 12, ptr nocapture %src) + %2 = call i32 @use_nocapture(ptr noundef nocapture %dest) + call void @llvm.lifetime.end.p0(i64 12, ptr nocapture %dest) + ret void +} + +; Tests that the optimization isn't performed when the destination is captured. +define void @dest_captured() { +; CHECK-LABEL: @dest_captured( +; CHECK-NEXT: [[SRC:%.*]] = alloca [[STRUCT_FOO:%.*]], align 4 +; CHECK-NEXT: [[DEST:%.*]] = alloca [[STRUCT_FOO]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr nocapture [[SRC]]) +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr nocapture [[DEST]]) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[SRC]], ptr align 4 @constant, i64 12, i1 false) +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[DEST]], ptr align 4 [[SRC]], i64 12, i1 false) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 12, ptr nocapture [[SRC]]) +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @use_capture(ptr noundef [[DEST]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 12, ptr nocapture [[DEST]]) +; CHECK-NEXT: ret void +; + %src = alloca %struct.Foo, align 4 + %dest = alloca %struct.Foo, align 4 + call void @llvm.lifetime.start.p0(i64 12, ptr nocapture %src) + call void @llvm.lifetime.start.p0(i64 12, ptr nocapture %dest) + call void @llvm.memcpy.p0.p0.i64(ptr align 4 %src, ptr align 4 @constant, i64 12, i1 false) + %1 = call i32 @use_nocapture(ptr noundef nocapture %src) + call void @llvm.memcpy.p0.p0.i64(ptr align 4 %dest, ptr align 4 %src, i64 12, i1 false) + call void @llvm.lifetime.end.p0(i64 12, ptr nocapture %src) + %2 = call i32 @use_capture(ptr noundef %dest) + call void @llvm.lifetime.end.p0(i64 12, ptr nocapture %dest) + ret void +} + +; Tests that the optimization isn't performed when the source is captured. 
+define void @src_captured() { +; CHECK-LABEL: @src_captured( +; CHECK-NEXT: [[SRC:%.*]] = alloca [[STRUCT_FOO:%.*]], align 4 +; CHECK-NEXT: [[DEST:%.*]] = alloca [[STRUCT_FOO]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr nocapture [[SRC]]) +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr nocapture [[DEST]]) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[SRC]], ptr align 4 @constant, i64 12, i1 false) +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @use_capture(ptr noundef [[SRC]]) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[DEST]], ptr align 4 [[SRC]], i64 12, i1 false) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 12, ptr nocapture [[SRC]]) +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[DEST]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 12, ptr nocapture [[DEST]]) +; CHECK-NEXT: ret void +; + %src = alloca %struct.Foo, align 4 + %dest = alloca %struct.Foo, align 4 + call void @llvm.lifetime.start.p0(i64 12, ptr nocapture %src) + call void @llvm.lifetime.start.p0(i64 12, ptr nocapture %dest) + call void @llvm.memcpy.p0.p0.i64(ptr align 4 %src, ptr align 4 @constant, i64 12, i1 false) + %1 = call i32 @use_capture(ptr noundef %src) + call void @llvm.memcpy.p0.p0.i64(ptr align 4 %dest, ptr align 4 %src, i64 12, i1 false) + call void @llvm.lifetime.end.p0(i64 12, ptr nocapture %src) + %2 = call i32 @use_nocapture(ptr noundef nocapture %dest) + call void @llvm.lifetime.end.p0(i64 12, ptr nocapture %dest) + ret void +} + +; Tests that the optimization isn't performed when the source and destination +; are simultaneously live within the basic block. +define void @local_liveness_conflict() { +; CHECK-LABEL: @local_liveness_conflict( +; CHECK-NEXT: [[SRC:%.*]] = alloca [[STRUCT_FOO:%.*]], align 4 +; CHECK-NEXT: [[DEST:%.*]] = alloca [[STRUCT_FOO]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr nocapture [[SRC]]) +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr nocapture [[DEST]]) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[SRC]], ptr align 4 @constant, i64 12, i1 false) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[DEST]], ptr align 4 @constant, i64 12, i1 false) +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 12, ptr nocapture [[SRC]]) +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[DEST]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 12, ptr nocapture [[DEST]]) +; CHECK-NEXT: ret void +; + %src = alloca %struct.Foo, align 4 + %dest = alloca %struct.Foo, align 4 + call void @llvm.lifetime.start.p0(i64 12, ptr nocapture %src) + call void @llvm.lifetime.start.p0(i64 12, ptr nocapture %dest) + call void @llvm.memcpy.p0.p0.i64(ptr align 4 %src, ptr align 4 @constant, i64 12, i1 false) + call void @llvm.memcpy.p0.p0.i64(ptr align 4 %dest, ptr align 4 %src, i64 12, i1 false) + %1 = call i32 @use_nocapture(ptr noundef nocapture %src) + call void @llvm.lifetime.end.p0(i64 12, ptr nocapture %src) + %2 = call i32 @use_nocapture(ptr noundef nocapture %dest) + call void @llvm.lifetime.end.p0(i64 12, ptr nocapture %dest) + ret void +} + +; Tests that the optimization isn't performed when the source and destination +; are simultaneously live in a way that can only be determined by examining +; multiple basic blocks. 
+define void @global_liveness_conflict() { +; CHECK-LABEL: @global_liveness_conflict( +; CHECK-NEXT: [[SRC:%.*]] = alloca [[STRUCT_FOO:%.*]], align 4 +; CHECK-NEXT: [[DEST:%.*]] = alloca [[STRUCT_FOO]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr nocapture [[SRC]]) +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr nocapture [[DEST]]) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[SRC]], ptr align 4 @constant, i64 12, i1 false) +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[DEST]], ptr align 4 [[SRC]], i64 12, i1 false) +; CHECK-NEXT: [[TMP2:%.*]] = call i1 @cond() +; CHECK-NEXT: br i1 [[TMP2]], label [[BB0:%.*]], label [[BB1:%.*]] +; CHECK: bb0: +; CHECK-NEXT: [[TMP3:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]) +; CHECK-NEXT: br label [[BB2:%.*]] +; CHECK: bb1: +; CHECK-NEXT: [[TMP4:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[DEST]]) +; CHECK-NEXT: br label [[BB2]] +; CHECK: bb2: +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 12, ptr nocapture [[SRC]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 12, ptr nocapture [[DEST]]) +; CHECK-NEXT: ret void +; + %src = alloca %struct.Foo, align 4 + %dest = alloca %struct.Foo, align 4 + call void @llvm.lifetime.start.p0(i64 12, ptr nocapture %src) + call void @llvm.lifetime.start.p0(i64 12, ptr nocapture %dest) + call void @llvm.memcpy.p0.p0.i64(ptr align 4 %src, ptr align 4 @constant, i64 12, i1 false) + %1 = call i32 @use_nocapture(ptr noundef nocapture %src) + call void @llvm.memcpy.p0.p0.i64(ptr align 4 %dest, ptr align 4 %src, i64 12, i1 false) + %2 = call i1 @cond() + br i1 %2, label %bb0, label %bb1 + +bb0: + %3 = call i32 @use_nocapture(ptr noundef nocapture %src) + br label %bb2 + +bb1: + %4 = call i32 @use_nocapture(ptr noundef nocapture %dest) + br label %bb2 + +bb2: + call void @llvm.lifetime.end.p0(i64 12, ptr nocapture %src) + call void @llvm.lifetime.end.p0(i64 12, ptr nocapture %dest) + ret void +} + +; Tests that a memcpy that doesn't completely overwrite a stack value is a use +; for the purposes of liveness analysis, not a definition. 
+define void @incomplete_memcpy_is_use() { +; CHECK-LABEL: @incomplete_memcpy_is_use( +; CHECK-NEXT: [[SRC:%.*]] = alloca [[STRUCT_FOO:%.*]], align 4 +; CHECK-NEXT: [[DEST:%.*]] = alloca [[STRUCT_FOO]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr nocapture [[SRC]]) +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr nocapture [[DEST]]) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[SRC]], ptr align 4 @constant, i64 12, i1 false) +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[DEST]], ptr align 4 [[SRC]], i64 11, i1 false) +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[DEST]]) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[SRC]], ptr align 4 @constant, i64 12, i1 false) +; CHECK-NEXT: [[TMP3:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 12, ptr nocapture [[SRC]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 12, ptr nocapture [[DEST]]) +; CHECK-NEXT: ret void +; + %src = alloca %struct.Foo, align 4 + %dest = alloca %struct.Foo, align 4 + call void @llvm.lifetime.start.p0(i64 12, ptr nocapture %src) + call void @llvm.lifetime.start.p0(i64 12, ptr nocapture %dest) + call void @llvm.memcpy.p0.p0.i64(ptr align 4 %src, ptr align 4 @constant, i64 12, i1 false) + %1 = call i32 @use_nocapture(ptr noundef nocapture %src) + call void @llvm.memcpy.p0.p0.i64(ptr align 4 %dest, ptr align 4 %src, i64 11, i1 false) + %2 = call i32 @use_nocapture(ptr noundef nocapture %dest) + call void @llvm.memcpy.p0.p0.i64(ptr align 4 %src, ptr align 4 @constant, i64 12, i1 false) + %3 = call i32 @use_nocapture(ptr noundef nocapture %src) + call void @llvm.lifetime.end.p0(i64 12, ptr nocapture %src) + call void @llvm.lifetime.end.p0(i64 12, ptr nocapture %dest) + ret void +} + +; Tests that a store that doesn't completely overwrite a stack value is a use +; for the purposes of liveness analysis, not a definition. 
+define void @incomplete_store_is_use() {
+; CHECK-LABEL: @incomplete_store_is_use(
+; CHECK-NEXT:    [[SRC:%.*]] = alloca [[STRUCT_FOO:%.*]], align 4
+; CHECK-NEXT:    [[DEST:%.*]] = alloca [[STRUCT_FOO]], align 4
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 12, ptr nocapture [[SRC]])
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 12, ptr nocapture [[DEST]])
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[SRC]], ptr align 4 @constant, i64 12, i1 false)
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]])
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[SRC]], align 4
+; CHECK-NEXT:    store i32 [[TMP2]], ptr [[DEST]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[DEST]])
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[SRC]], ptr align 4 @constant, i64 12, i1 false)
+; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]])
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 12, ptr nocapture [[SRC]])
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 12, ptr nocapture [[DEST]])
+; CHECK-NEXT:    ret void
+;
+  %src = alloca %struct.Foo, align 4
+  %dest = alloca %struct.Foo, align 4
+  call void @llvm.lifetime.start.p0(i64 12, ptr nocapture %src)
+  call void @llvm.lifetime.start.p0(i64 12, ptr nocapture %dest)
+  call void @llvm.memcpy.p0.p0.i64(ptr align 4 %src, ptr align 4 @constant, i64 12, i1 false)
+  %1 = call i32 @use_nocapture(ptr noundef nocapture %src)
+  %2 = load i32, ptr %src
+  store i32 %2, ptr %dest
+  %3 = call i32 @use_nocapture(ptr noundef nocapture %dest)
+  call void @llvm.memcpy.p0.p0.i64(ptr align 4 %src, ptr align 4 @constant, i64 12, i1 false)
+  %4 = call i32 @use_nocapture(ptr noundef nocapture %src)
+  call void @llvm.lifetime.end.p0(i64 12, ptr nocapture %src)
+  call void @llvm.lifetime.end.p0(i64 12, ptr nocapture %dest)
+  ret void
+}
+
+; Tests that we don't incorrectly try to perform the optimization for allocas
+; that follow llvm.stacksave intrinsics, by restricting the optimization to
+; allocas defined at the start of the entry block.
+define void @stacksave() { +; CHECK-LABEL: @stacksave( +; CHECK-NEXT: [[SRC:%.*]] = alloca [[STRUCT_FOO:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = call ptr @llvm.stacksave() +; CHECK-NEXT: [[DEST:%.*]] = alloca [[STRUCT_FOO]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr nocapture [[SRC]]) +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr nocapture [[DEST]]) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[SRC]], ptr align 4 @constant, i64 12, i1 false) +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[DEST]], ptr align 4 [[SRC]], i64 12, i1 false) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 12, ptr nocapture [[SRC]]) +; CHECK-NEXT: [[TMP3:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[DEST]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 12, ptr nocapture [[DEST]]) +; CHECK-NEXT: ret void +; + %src = alloca %struct.Foo, align 4 + %1 = call ptr @llvm.stacksave() + %dest = alloca %struct.Foo, align 4 + call void @llvm.lifetime.start.p0(i64 12, ptr nocapture %src) + call void @llvm.lifetime.start.p0(i64 12, ptr nocapture %dest) + call void @llvm.memcpy.p0.p0.i64(ptr align 4 %src, ptr align 4 @constant, i64 12, i1 false) + %2 = call i32 @use_nocapture(ptr noundef nocapture %src) + call void @llvm.memcpy.p0.p0.i64(ptr align 4 %dest, ptr align 4 %src, i64 12, i1 false) + call void @llvm.lifetime.end.p0(i64 12, ptr nocapture %src) + %3 = call i32 @use_nocapture(ptr noundef nocapture %dest) + call void @llvm.lifetime.end.p0(i64 12, ptr nocapture %dest) + ret void +} + +; Tests that the optimization fails if too many basic blocks are examined. +define void @too_many_basic_blocks() { +; CHECK-LABEL: @too_many_basic_blocks( +; CHECK-NEXT: [[SRC:%.*]] = alloca [[STRUCT_FOO:%.*]], align 4 +; CHECK-NEXT: [[DEST:%.*]] = alloca [[STRUCT_FOO]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr nocapture [[SRC]]) +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr nocapture [[DEST]]) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[SRC]], ptr align 4 @constant, i64 12, i1 false) +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]) +; CHECK-NEXT: br label [[BB0:%.*]] +; CHECK: bb0: +; CHECK-NEXT: br label [[BB1:%.*]] +; CHECK: bb1: +; CHECK-NEXT: br label [[BB2:%.*]] +; CHECK: bb2: +; CHECK-NEXT: br label [[BB3:%.*]] +; CHECK: bb3: +; CHECK-NEXT: br label [[BB4:%.*]] +; CHECK: bb4: +; CHECK-NEXT: br label [[BB5:%.*]] +; CHECK: bb5: +; CHECK-NEXT: br label [[BB6:%.*]] +; CHECK: bb6: +; CHECK-NEXT: br label [[BB7:%.*]] +; CHECK: bb7: +; CHECK-NEXT: br label [[BB8:%.*]] +; CHECK: bb8: +; CHECK-NEXT: br label [[BB9:%.*]] +; CHECK: bb9: +; CHECK-NEXT: br label [[BB10:%.*]] +; CHECK: bb10: +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[DEST]], ptr align 4 [[SRC]], i64 12, i1 false) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 12, ptr nocapture [[SRC]]) +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[DEST]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 12, ptr nocapture [[DEST]]) +; CHECK-NEXT: ret void +; + %src = alloca %struct.Foo, align 4 + %dest = alloca %struct.Foo, align 4 + call void @llvm.lifetime.start.p0(i64 12, ptr nocapture %src) + call void @llvm.lifetime.start.p0(i64 12, ptr nocapture %dest) + call void @llvm.memcpy.p0.p0.i64(ptr align 4 %src, ptr align 4 @constant, i64 12, i1 false) + %1 = call i32 
@use_nocapture(ptr noundef nocapture %src) + br label %bb0 + +bb0: + br label %bb1 +bb1: + br label %bb2 +bb2: + br label %bb3 +bb3: + br label %bb4 +bb4: + br label %bb5 +bb5: + br label %bb6 +bb6: + br label %bb7 +bb7: + br label %bb8 +bb8: + br label %bb9 +bb9: + br label %bb10 + +bb10: + call void @llvm.memcpy.p0.p0.i64(ptr align 4 %dest, ptr align 4 %src, i64 12, i1 false) + call void @llvm.lifetime.end.p0(i64 12, ptr nocapture %src) + %2 = call i32 @use_nocapture(ptr noundef nocapture %dest) + call void @llvm.lifetime.end.p0(i64 12, ptr nocapture %dest) + ret void +} + +declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) +declare void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) +declare void @llvm.memcpy.p2.p1.i64(ptr addrspace(2) noalias nocapture writeonly, ptr addrspace(1) noalias nocapture readonly, i64, i1 immarg) +declare void @llvm.memmove.p0.p0.i64(ptr nocapture writeonly, ptr nocapture readonly, i64, i1 immarg) +declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) +declare void @llvm.lifetime.start.p0(i64, ptr nocapture) +declare void @llvm.lifetime.end.p0(i64, ptr nocapture) +declare void @llvm.lifetime.start.p1(i64, ptr addrspace(1) nocapture) +declare void @llvm.lifetime.end.p1(i64, ptr addrspace(1) nocapture) +declare void @llvm.lifetime.start.p2(i64, ptr addrspace(2) nocapture) +declare void @llvm.lifetime.end.p2(i64, ptr addrspace(2) nocapture) +declare ptr @llvm.stacksave() + +declare i32 @use_nocapture(ptr noundef nocapture) +declare i32 @use_nocapture_p1(ptr addrspace(1) noundef nocapture) +declare i32 @use_nocapture_p2(ptr addrspace(2) noundef nocapture) +declare i32 @use_capture(ptr noundef) +declare i1 @cond() + +; Scope domain +!0 = !{!0} +; Scope in that domain +!1 = !{!1, !0} +; Scope list +!2 = !{!1} + +!3 = !{!"Whatever"} +