Index: llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h
===================================================================
--- llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h
+++ llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h
@@ -20,6 +20,7 @@
 namespace llvm {
 
 class AAResults;
+class AllocaInst;
 class BatchAAResults;
 class AssumptionCache;
 class CallBase;
@@ -77,6 +78,9 @@
   Instruction *tryMergingIntoMemset(Instruction *I, Value *StartPtr,
                                     Value *ByteVal);
   bool moveUp(StoreInst *SI, Instruction *P, const LoadInst *LI);
+  bool performStackMoveOptzn(Instruction *Load, Instruction *Store,
+                             AllocaInst *DestAlloca, AllocaInst *SrcAlloca,
+                             uint64_t Size);
 
   void eraseInstruction(Instruction *I);
   bool iterateOnFunction(Function &F);
Index: llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
===================================================================
--- llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -69,6 +69,7 @@
 STATISTIC(NumMoveToCpy,   "Number of memmoves converted to memcpy");
 STATISTIC(NumCpyToSet,    "Number of memcpys converted to memset");
 STATISTIC(NumCallSlot,    "Number of call slot optimizations performed");
+STATISTIC(NumStackMove, "Number of stack-move optimizations performed");
 
 namespace {
 
@@ -730,6 +731,23 @@
     return true;
   }
 
+  // If this is a load-store pair from a stack slot to a stack slot, we
+  // might be able to perform the stack-move optimization just as we do for
+  // memcpys from an alloca to an alloca.
+  if (auto *DestAlloca = dyn_cast<AllocaInst>(SI->getPointerOperand())) {
+    if (auto *SrcAlloca = dyn_cast<AllocaInst>(LI->getPointerOperand())) {
+      if (performStackMoveOptzn(LI, SI, DestAlloca, SrcAlloca,
+                                DL.getTypeStoreSize(T))) {
+        // Avoid invalidating the iterator.
+        BBI = SI->getNextNonDebugInstruction()->getIterator();
+        eraseInstruction(SI);
+        eraseInstruction(LI);
+        ++NumMemCpyInstr;
+        return true;
+      }
+    }
+  }
+
   return false;
 }
 
@@ -1407,6 +1425,120 @@
 
   return true;
 }
+bool MemCpyOptPass::performStackMoveOptzn(Instruction *Load, Instruction *Store,
+                                          AllocaInst *DestAlloca,
+                                          AllocaInst *SrcAlloca,
+                                          uint64_t Size) {
+  LLVM_DEBUG(dbgs() << "Stack Move: Attempting to optimize:\n"
+                    << *Store << "\n");
+
+  // Make sure the two allocas are in the same address space.
+  if (SrcAlloca->getAddressSpace() != DestAlloca->getAddressSpace()) {
+    LLVM_DEBUG(dbgs() << "Stack Move: Address space mismatch\n");
+    return false;
+  }
+
+  // 1. Check that copy is full
+
+  // Calculate the static size of the allocas to be merged, bailing out if we
+  // can't
+  const DataLayout &DL = DestAlloca->getModule()->getDataLayout();
+  std::optional<TypeSize> SrcSize = SrcAlloca->getAllocationSize(DL);
+  if (!SrcSize || SrcSize->isScalable() || Size != SrcSize->getFixedValue()) {
+    LLVM_DEBUG(dbgs() << "Stack Move: Source alloca size mismatch\n");
+    return false;
+  }
+  std::optional<TypeSize> DestSize = DestAlloca->getAllocationSize(DL);
+  if (!DestSize || DestSize->isScalable() ||
+      Size != DestSize->getFixedValue()) {
+    LLVM_DEBUG(dbgs() << "Stack Move: Destination alloca size mismatch\n");
+    return false;
+  }
+
+  // 2. Check that src and dest allocas is not captured
+  if (llvm::PointerMayBeCaptured(DestAlloca, /* ReturnCaptures=*/true,
+                                 /* StoreCaptures= */ true) ||
+      llvm::PointerMayBeCaptured(SrcAlloca, /* ReturnCaptures=*/true,
+                                 /* StoreCaptures= */ true))
+    return false;
+  // TODO: refine the before memcpy condition for dest alloca
+  // Bailout if any Mod except lifetime intrinsics exists before memcpy on the
+  // dest.
+  auto DestLoc = MemoryLocation(DestAlloca, LocationSize::precise(Size));
+  for (User *U : DestAlloca->users()) {
+    if (auto *I = dyn_cast<Instruction>(U)) {
+      if (!I->comesBefore(Store))
+        continue;
+      if (auto *II = dyn_cast<IntrinsicInst>(I);
+          II && II->isLifetimeStartOrEnd())
+        continue;
+      if (isModSet(AA->getModRefInfo(I, DestLoc)))
+        return false;
+    }
+  }
+
+  // 3. The src has no read and no write except lifetime intrinsics, from
+  // after the full copy to the end of the BB.
+  for (auto It = ++Store->getIterator(), E = Store->getParent()->end(); It != E;
+       ++It) {
+    // See all users of the SrcAlloca, between full copy and end of the BB.
+    llvm::Instruction &I = *It;
+    for (auto &V : I.operands()) {
+      auto *AI = dyn_cast<AllocaInst>(V);
+      if (AI == SrcAlloca)
+        if (auto *II = dyn_cast<IntrinsicInst>(&I);
+            !(II && II->isLifetimeStartOrEnd()))
+          return false;
+    }
+  }
+
+  // We can do the transformation. First, align the allocas appropriately.
+  SrcAlloca->setAlignment(
+      std::max(SrcAlloca->getAlign(), DestAlloca->getAlign()));
+
+  // Merge the two allocas.
+  DestAlloca->replaceAllUsesWith(SrcAlloca);
+
+  // Erase redundant lifetimes
+  IntrinsicInst *LifetimeStart = nullptr, *LifetimeEnd = nullptr;
+  SmallVector<IntrinsicInst *, 4> LifetimeToErase;
+  for (User *U : SrcAlloca->users()) {
+    if (auto *II = dyn_cast<IntrinsicInst>(U)) {
+      if (II->getIntrinsicID() == Intrinsic::lifetime_start) {
+        if (!LifetimeStart)
+          LifetimeStart = II;
+        else if (II->comesBefore(LifetimeStart)) {
+          LifetimeToErase.push_back(LifetimeStart);
+          LifetimeStart = II;
+        } else
+          LifetimeToErase.push_back(II);
+      } else if (II->getIntrinsicID() == Intrinsic::lifetime_end) {
+        if (!LifetimeEnd)
+          LifetimeEnd = II;
+        else if (LifetimeEnd->comesBefore(II)) {
+          LifetimeToErase.push_back(LifetimeEnd);
+          LifetimeEnd = II;
+        } else
+          LifetimeToErase.push_back(II);
+      }
+    }
+  }
+
+  // FIXME: still crash on erase
+  for (auto *II : make_early_inc_range(LifetimeToErase)) {
+    eraseInstruction(II);
+  }
+
+  // Drop metadata on the source alloca.
+  SrcAlloca->dropUnknownNonDebugMetadata();
+
+  // TODO: remove all noalias metadata from src alloca. or not ? Does this
+  // change the expectation if the noalias attributes are there?
+
+  LLVM_DEBUG(dbgs() << "Stack Move: Performed staack-move optimization\n");
+  NumStackMove++;
+  return false;
+}
 
 /// Perform simplification of memcpy's.  If we have memcpy A
 /// which copies X to Y, and memcpy B which copies Y to Z, then we can rewrite
@@ -1488,8 +1620,10 @@
           }
         }
       }
-      if (auto *MDep = dyn_cast<MemCpyInst>(MI))
-        return processMemCpyMemCpyDependence(M, MDep, BAA);
+      if (auto *MDep = dyn_cast<MemCpyInst>(MI)) {
+        if (processMemCpyMemCpyDependence(M, MDep, BAA))
+          return true;
+      }
       if (auto *MDep = dyn_cast<MemSetInst>(MI)) {
         if (performMemCpyToMemSetOptzn(M, MDep, BAA)) {
           LLVM_DEBUG(dbgs() << "Converted memcpy to memset\n");
@@ -1508,6 +1642,26 @@
     }
   }
 
+  // If the transfer is from a stack slot to a stack slot, then we may be able
+  // to perform the stack-move optimization. See the comments in
+  // performStackMoveOptzn() for more details.
+  AllocaInst *DestAlloca = dyn_cast<AllocaInst>(M->getDest());
+  if (DestAlloca == nullptr)
+    return false;
+  AllocaInst *SrcAlloca = dyn_cast<AllocaInst>(M->getSource());
+  if (SrcAlloca == nullptr)
+    return false;
+  ConstantInt *Len = dyn_cast<ConstantInt>(M->getLength());
+  if (Len == nullptr)
+    return false;
+  if (performStackMoveOptzn(M, M, DestAlloca, SrcAlloca, Len->getZExtValue())) {
+    // Avoid invalidating the iterator.
+    BBI = M->getNextNonDebugInstruction()->getIterator();
+    eraseInstruction(M);
+    ++NumMemCpyInstr;
+    return true;
+  }
+
   return false;
 }
 
Index: llvm/test/Analysis/ScopedNoAliasAA/alias-scope-merging.ll
===================================================================
--- llvm/test/Analysis/ScopedNoAliasAA/alias-scope-merging.ll
+++ llvm/test/Analysis/ScopedNoAliasAA/alias-scope-merging.ll
@@ -1,11 +1,24 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
 ; RUN: opt < %s -S -passes=memcpyopt | FileCheck --match-full-lines %s
 
 ; Alias scopes are merged by taking the intersection of domains, then the union of the scopes within those domains
 define i8 @test(i8 %input) {
+; CHECK-LABEL: define i8 @test
+; CHECK-SAME: (i8 [[INPUT:%.*]]) {
+; CHECK-NEXT:    [[TMP:%.*]] = alloca i8, align 1
+; CHECK-NEXT:    [[DST:%.*]] = alloca i8, align 1
+; CHECK-NEXT:    [[SRC:%.*]] = alloca i8, align 1
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr nonnull [[SRC]]), !noalias !0
+; CHECK-NEXT:    store i8 [[INPUT]], ptr [[SRC]], align 1
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[SRC]], ptr align 8 [[SRC]], i64 1, i1 false), !alias.scope !5
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr nonnull [[SRC]]), !noalias !0
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[SRC]], ptr align 8 [[SRC]], i64 1, i1 false), !alias.scope !0
+; CHECK-NEXT:    [[RET_VALUE:%.*]] = load i8, ptr [[SRC]], align 1
+; CHECK-NEXT:    ret i8 [[RET_VALUE]]
+;
   %tmp = alloca i8
   %dst = alloca i8
   %src = alloca i8
-; CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dst, ptr align 8 %src, i64 1, i1 false), !alias.scope ![[SCOPE:[0-9]+]]
   call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %src), !noalias !4
   store i8 %input, ptr %src
   call void @llvm.memcpy.p0.p0.i64(ptr align 8 %tmp, ptr align 8 %src, i64 1, i1 false), !alias.scope !0
@@ -16,9 +29,6 @@
 }
 
 ; Merged scope contains "callee0: %a" and "callee0 : %b"
-; CHECK-DAG: ![[CALLEE0_A:[0-9]+]] = distinct !{!{{[0-9]+}}, !{{[0-9]+}}, !"callee0: %a"}
-; CHECK-DAG: ![[CALLEE0_B:[0-9]+]] = distinct !{!{{[0-9]+}}, !{{[0-9]+}}, !"callee0: %b"}
-; CHECK-DAG: ![[SCOPE]] = !{![[CALLEE0_A]], ![[CALLEE0_B]]}
 
 declare void @llvm.lifetime.start.p0(i64, ptr nocapture)
 declare void @llvm.lifetime.end.p0(i64, ptr nocapture)
Index: llvm/test/Transforms/MemCpyOpt/callslot.ll
===================================================================
--- llvm/test/Transforms/MemCpyOpt/callslot.ll
+++ llvm/test/Transforms/MemCpyOpt/callslot.ll
@@ -60,7 +60,7 @@
 ; CHECK-NEXT:    [[SRC:%.*]] = alloca [16 x i8], align 1
 ; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr [[SRC]], i8 0, i64 16, i1 false)
 ; CHECK-NEXT:    store i8 1, ptr [[SRC]], align 1
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr [[DEST]], ptr [[SRC]], i64 16, i1 false)
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr [[SRC]], ptr [[SRC]], i64 16, i1 false)
 ; CHECK-NEXT:    ret void
 ;
   %dest = alloca [16 x i8]
Index: llvm/test/Transforms/MemCpyOpt/callslot_badaa.ll
===================================================================
--- llvm/test/Transforms/MemCpyOpt/callslot_badaa.ll
+++ llvm/test/Transforms/MemCpyOpt/callslot_badaa.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
 ; RUN: opt < %s -S -passes=memcpyopt | FileCheck --match-full-lines %s
 
 ; Make sure callslot optimization merges alias.scope metadata correctly when it merges instructions.
@@ -13,11 +14,23 @@
 ;  !5 = distinct !{!5, !"callee0"}
 ; Which is incorrect because the lifetime.end of %src will now "noalias" the above memcpy.
 define i8 @test(i8 %input) {
+; CHECK-LABEL: define i8 @test
+; CHECK-SAME: (i8 [[INPUT:%.*]]) {
+; CHECK-NEXT:    [[TMP:%.*]] = alloca i8, align 1
+; CHECK-NEXT:    [[DST:%.*]] = alloca i8, align 1
+; CHECK-NEXT:    [[SRC:%.*]] = alloca i8, align 1
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr nonnull [[SRC]]), !noalias !0
+; CHECK-NEXT:    store i8 [[INPUT]], ptr [[SRC]], align 1
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[SRC]], ptr align 8 [[SRC]], i64 1, i1 false), !alias.scope !3
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr nonnull [[SRC]]), !noalias !0
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[SRC]], ptr align 8 [[SRC]], i64 1, i1 false), !alias.scope !0
+; CHECK-NEXT:    [[RET_VALUE:%.*]] = load i8, ptr [[SRC]], align 1
+; CHECK-NEXT:    ret i8 [[RET_VALUE]]
+;
   %tmp = alloca i8
   %dst = alloca i8
   %src = alloca i8
 ; NOTE: we're matching the full line and looking for the lack of !alias.scope here
-; CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dst, ptr align 8 %src, i64 1, i1 false)
   call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %src), !noalias !3
   store i8 %input, ptr %src
   call void @llvm.memcpy.p0.p0.i64(ptr align 8 %tmp, ptr align 8 %src, i64 1, i1 false), !alias.scope !0
Index: llvm/test/Transforms/MemCpyOpt/memcpy.ll
===================================================================
--- llvm/test/Transforms/MemCpyOpt/memcpy.ll
+++ llvm/test/Transforms/MemCpyOpt/memcpy.ll
@@ -599,6 +599,7 @@
 ; CHECK-LABEL: @immut_param_enforced_alignment(
 ; CHECK-NEXT:    [[VAL:%.*]] = alloca i8, align 4
 ; CHECK-NEXT:    store i32 42, ptr [[VAL]], align 4
+; CHECK-NEXT:    [[VAL1:%.*]] = alloca i8, align 4
 ; CHECK-NEXT:    call void @f(ptr noalias nocapture readonly [[VAL]])
 ; CHECK-NEXT:    ret void
 ;
@@ -647,6 +648,7 @@
 ; CHECK-LABEL: @immut_unescaped_alloca(
 ; CHECK-NEXT:    [[VAL:%.*]] = alloca i8, align 4
 ; CHECK-NEXT:    store i32 42, ptr [[VAL]], align 4
+; CHECK-NEXT:    [[VAL1:%.*]] = alloca i8, align 4
 ; CHECK-NEXT:    call void @f_full_readonly(ptr [[VAL]])
 ; CHECK-NEXT:    ret void
 ;
Index: llvm/test/Transforms/MemCpyOpt/stack-move.ll
===================================================================
--- llvm/test/Transforms/MemCpyOpt/stack-move.ll
+++ llvm/test/Transforms/MemCpyOpt/stack-move.ll
@@ -27,13 +27,12 @@
 ; CHECK-NEXT:    [[SRC:%.*]] = alloca [[STRUCT_FOO:%.*]], align 4
 ; CHECK-NEXT:    [[DEST:%.*]] = alloca [[STRUCT_FOO]], align 4
 ; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 12, ptr nocapture [[SRC]])
-; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 12, ptr nocapture [[DEST]])
 ; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[SRC]], ptr align 4 @constant, i64 12, i1 false)
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]])
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[DEST]], ptr align 4 [[SRC]], i64 12, i1 false)
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[SRC]], ptr align 4 [[SRC]], i64 12, i1 false)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 12, ptr nocapture [[SRC]])
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]])
 ; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 12, ptr nocapture [[SRC]])
-; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[DEST]])
-; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 12, ptr nocapture [[DEST]])
 ; CHECK-NEXT:    ret void
 ;
   %src = alloca %struct.Foo, align 4
@@ -56,13 +55,11 @@
 ; CHECK-NEXT:    [[SRC:%.*]] = alloca [[STRUCT_FOO:%.*]], align 4
 ; CHECK-NEXT:    [[DEST:%.*]] = alloca [[STRUCT_FOO]], align 4
 ; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 12, ptr nocapture [[SRC]])
-; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 12, ptr nocapture [[DEST]])
 ; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[SRC]], ptr align 4 @constant, i64 12, i1 false)
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]])
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[DEST]], ptr align 4 [[SRC]], i64 12, i1 false)
 ; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 12, ptr nocapture [[SRC]])
-; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[DEST]])
-; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 12, ptr nocapture [[DEST]])
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]])
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 12, ptr nocapture [[SRC]])
 ; CHECK-NEXT:    ret void
 ;
   %src = alloca %struct.Foo, align 4
@@ -85,14 +82,13 @@
 ; CHECK-NEXT:    [[SRC:%.*]] = alloca i32, align 4
 ; CHECK-NEXT:    [[DEST:%.*]] = alloca i32, align 4
 ; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr nocapture [[SRC]])
-; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr nocapture [[DEST]])
 ; CHECK-NEXT:    store i32 42, ptr [[SRC]], align 4
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]])
 ; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[SRC]], align 4
-; CHECK-NEXT:    store i32 [[TMP2]], ptr [[DEST]], align 4
+; CHECK-NEXT:    store i32 [[TMP2]], ptr [[SRC]], align 4
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr nocapture [[SRC]])
+; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]])
 ; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr nocapture [[SRC]])
-; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[DEST]])
-; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr nocapture [[DEST]])
 ; CHECK-NEXT:    ret void
 ;
   %src = alloca i32, align 4
@@ -117,13 +113,12 @@
 ; CHECK-NEXT:    [[SRC:%.*]] = alloca [[STRUCT_FOO:%.*]], align 8
 ; CHECK-NEXT:    [[DEST:%.*]] = alloca [[STRUCT_FOO]], align 4
 ; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 12, ptr nocapture [[SRC]])
-; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 12, ptr nocapture [[DEST]])
 ; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[SRC]], ptr align 4 @constant, i64 12, i1 false)
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]])
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[DEST]], ptr align 4 [[SRC]], i64 12, i1 false)
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[SRC]], ptr align 4 [[SRC]], i64 12, i1 false)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 12, ptr nocapture [[SRC]])
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]])
 ; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 12, ptr nocapture [[SRC]])
-; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[DEST]])
-; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 12, ptr nocapture [[DEST]])
 ; CHECK-NEXT:    ret void
 ;
   %src = alloca %struct.Foo, align 8
@@ -147,18 +142,17 @@
 ; CHECK-NEXT:    [[SRC:%.*]] = alloca [[STRUCT_FOO:%.*]], align 4
 ; CHECK-NEXT:    [[DEST:%.*]] = alloca [[STRUCT_FOO]], align 4
 ; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 12, ptr nocapture [[SRC]])
-; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 12, ptr nocapture [[DEST]])
 ; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[SRC]], ptr align 4 @constant, i64 12, i1 false)
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]])
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[DEST]], ptr align 4 [[SRC]], i64 12, i1 false)
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[SRC]], ptr align 4 [[SRC]], i64 12, i1 false)
 ; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 12, ptr nocapture [[SRC]])
-; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[DEST]])
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]])
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 12, ptr nocapture [[SRC]])
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 12, ptr nocapture [[SRC]])
+; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]])
 ; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 12, ptr nocapture [[SRC]])
 ; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 12, ptr nocapture [[SRC]])
-; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[DEST]])
 ; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 12, ptr nocapture [[SRC]])
-; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 12, ptr nocapture [[DEST]])
-; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 12, ptr nocapture [[DEST]])
 ; CHECK-NEXT:    ret void
 ;
   %src = alloca %struct.Foo, align 4
@@ -179,20 +173,19 @@
   ret void
 }
 
-; TODO: Remove metadata and merge
+; TODO: remove metadata
 ; Tests that we remove scoped noalias metadata from a call.
 define void @remove_scoped_noalias() {
 ; CHECK-LABEL: define void @remove_scoped_noalias() {
 ; CHECK-NEXT:    [[SRC:%.*]] = alloca [[STRUCT_FOO:%.*]], align 4
 ; CHECK-NEXT:    [[DEST:%.*]] = alloca [[STRUCT_FOO]], align 4
 ; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 12, ptr nocapture [[SRC]])
-; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 12, ptr nocapture [[DEST]])
 ; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[SRC]], ptr align 4 @constant, i64 12, i1 false)
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]), !alias.scope !0
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[DEST]], ptr align 4 [[SRC]], i64 12, i1 false)
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[SRC]], ptr align 4 [[SRC]], i64 12, i1 false)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 12, ptr nocapture [[SRC]])
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]), !noalias !0
 ; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 12, ptr nocapture [[SRC]])
-; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[DEST]]), !noalias !0
-; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 12, ptr nocapture [[DEST]])
 ; CHECK-NEXT:    ret void
 ;
   %src = alloca %struct.Foo, align 4
@@ -212,16 +205,15 @@
 ; Tests that we remove metadata on the merged alloca.
 define void @remove_alloca_metadata() {
 ; CHECK-LABEL: define void @remove_alloca_metadata() {
-; CHECK-NEXT:    [[SRC:%.*]] = alloca [[STRUCT_FOO:%.*]], align 4, !annotation !3
+; CHECK-NEXT:    [[SRC:%.*]] = alloca [[STRUCT_FOO:%.*]], align 4
 ; CHECK-NEXT:    [[DEST:%.*]] = alloca [[STRUCT_FOO]], align 4
 ; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 12, ptr nocapture [[SRC]])
-; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 12, ptr nocapture [[DEST]])
 ; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[SRC]], ptr align 4 @constant, i64 12, i1 false)
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]), !alias.scope !0
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[DEST]], ptr align 4 [[SRC]], i64 12, i1 false)
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[SRC]], ptr align 4 [[SRC]], i64 12, i1 false)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 12, ptr nocapture [[SRC]])
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]), !noalias !0
 ; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 12, ptr nocapture [[SRC]])
-; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[DEST]]), !noalias !0
-; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 12, ptr nocapture [[DEST]])
 ; CHECK-NEXT:    ret void
 ;
   %src = alloca %struct.Foo, align 4, !annotation !3
Index: llvm/test/Transforms/MemCpyOpt/stackrestore.ll
===================================================================
--- llvm/test/Transforms/MemCpyOpt/stackrestore.ll
+++ llvm/test/Transforms/MemCpyOpt/stackrestore.ll
@@ -56,10 +56,10 @@
 ; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr [[ARGMEM]], ptr align 1 @str, i32 9, i1 false)
 ; CHECK-NEXT:    [[P10:%.*]] = getelementptr inbounds [10 x i8], ptr [[ARGMEM]], i32 0, i32 9
 ; CHECK-NEXT:    store i8 0, ptr [[P10]], align 1
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr [[TMPMEM]], ptr [[ARGMEM]], i32 10, i1 false)
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr [[ARGMEM]], ptr [[ARGMEM]], i32 10, i1 false)
 ; CHECK-NEXT:    call void @llvm.stackrestore(ptr [[INALLOCA_SAVE]])
 ; CHECK-NEXT:    [[HEAP:%.*]] = call ptr @malloc(i32 9)
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr [[HEAP]], ptr [[TMPMEM]], i32 9, i1 false)
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr [[HEAP]], ptr [[ARGMEM]], i32 9, i1 false)
 ; CHECK-NEXT:    call void @useit(ptr [[HEAP]])
 ; CHECK-NEXT:    ret i32 0
 ;