diff --git a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
--- a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -29,6 +29,7 @@
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/TargetRegisterInfo.h"
 #include "llvm/IR/DebugLoc.h"
+#include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/CommandLine.h"
@@ -169,9 +170,9 @@
   findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I, unsigned Limit);
 
   // Find an instruction that updates the base register of the ld/st
-  // instruction.
-  bool isMatchingUpdateInsn(MachineInstr &MemMI, MachineInstr &MI,
-                            unsigned BaseReg, int Offset);
+  // instruction. Return the update Offset if found, or 0 otherwise.
+  int isMatchingUpdateInsn(MachineInstr &MemMI, MachineInstr &MI,
+                           unsigned BaseReg, int Offset);
 
   // Merge a pre- or post-index base register update into a ld/st instruction.
   MachineBasicBlock::iterator
@@ -1698,9 +1699,9 @@
   return NextI;
 }
 
-bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI,
-                                               MachineInstr &MI,
-                                               unsigned BaseReg, int Offset) {
+int AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI,
+                                              MachineInstr &MI,
+                                              unsigned BaseReg, int Offset) {
   switch (MI.getOpcode()) {
   default:
     break;
@@ -1739,10 +1740,10 @@
     // If we have a non-zero Offset, we check that it matches the amount
     // we're adding to the register.
     if (!Offset || Offset == UpdateOffset)
-      return true;
+      return UpdateOffset;
     break;
   }
-  return false;
+  return 0;
 }
 
 MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
@@ -1780,6 +1781,23 @@
   ModifiedRegUnits.clear();
   UsedRegUnits.clear();
   ++MBBI;
+
+  // We can't post-increment the stack pointer if any instruction between
+  // the memory access (I) and the increment (MBBI) can access the memory
+  // region defined by [SP, MBBI]; doing so would pop the stack prematurely.
+  const bool BaseRegSP = BaseReg == AArch64::SP;
+
+  if (BaseRegSP) {
+    // FIXME: For now, we always block the optimization over SP on Windows
+    // targets, as it would require adjusting the unwind/debug info; getting
+    // the unwind info wrong can actually cause a miscompile.
+    const MCAsmInfo *MAI = I->getMF()->getTarget().getMCAsmInfo();
+    if (MAI->usesWindowsCFI() &&
+        I->getMF()->getFunction().needsUnwindTableEntry())
+      return E;
+  }
+
+  bool MayAccessStack = false;
   for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) {
     MachineInstr &MI = *MBBI;
 
@@ -1789,8 +1807,13 @@
     ++Count;
 
     // If we found a match, return it.
-    if (isMatchingUpdateInsn(*I, MI, BaseReg, UnscaledOffset))
+    const int Offset = isMatchingUpdateInsn(*I, MI, BaseReg, UnscaledOffset);
+    if (Offset) {
+      // Don't fold a stack pop past instructions that may access the stack.
+      if (MayAccessStack && Offset > 0)
+        continue;
       return MBBI;
+    }
 
     // Update the status of what the instruction clobbered and used.
     LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
@@ -1800,6 +1823,25 @@
     if (!ModifiedRegUnits.available(BaseReg) ||
         !UsedRegUnits.available(BaseReg))
       return E;
+
+    // FIXME: For now we conservatively treat any instruction that might access
+    // the stack as a hazard. We should check that the access lies within the
+    // region delimited by the two memory operands (e.g. [SP, MBBI]), but that
+    // requires a new may-alias API. Other registers may also point to the stack.
+    if (!MayAccessStack && BaseRegSP && MBBI->mayLoadOrStore()) {
+      if (MBBI->memoperands_empty()) {
+        MayAccessStack = true;
+        continue;
+      }
+      MachineFrameInfo *MFI = &(I->getMF()->getFrameInfo());
+      assert(MFI);
+      for (MachineMemOperand *MMO : MBBI->memoperands()) {
+        const PseudoSourceValue *PSVa = MMO->getPseudoValue();
+        MayAccessStack = (!PSVa) || PSVa->mayAlias(MFI);
+        if (MayAccessStack)
+          break;
+      }
+    }
   }
   return E;
 }
@@ -1844,7 +1886,7 @@
     ++Count;
 
     // If we found a match, return it.
-    if (isMatchingUpdateInsn(*I, MI, BaseReg, Offset))
+    if (isMatchingUpdateInsn(*I, MI, BaseReg, Offset) != 0)
       return MBBI;
 
     // Update the status of what the instruction clobbered and used.
diff --git a/llvm/test/CodeGen/AArch64/aarch64-ldst-no-premature-sp-pop.mir b/llvm/test/CodeGen/AArch64/aarch64-ldst-no-premature-sp-pop.mir
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/aarch64-ldst-no-premature-sp-pop.mir
@@ -0,0 +1,85 @@
+# RUN: llc -start-before=aarch64-ldst-opt -o - %s | FileCheck %s
+# CHECK-NOT: stp xzr, xzr, [sp], #16
+# CHECK: add sp, sp, #16
+--- |
+  target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+  target triple = "aarch64-arm-none-eabi"
+
+  define hidden i32 @foo(i32 %0) {
+    %2 = alloca [4 x i32], align 4
+    %3 = bitcast [4 x i32]* %2 to i8*
+    call void @llvm.memset.p0i8.i64(i8* nonnull align 4 dereferenceable(16) %3, i8 0, i64 16, i1 false)
+    %4 = sext i32 %0 to i64
+    %5 = getelementptr inbounds [4 x i32], [4 x i32]* %2, i64 0, i64 %4
+    %6 = load i32, i32* %5, align 4
+    ret i32 %6
+  }
+
+  declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1 immarg) #2
+  declare void @llvm.stackprotector(i8*, i8**) #3
+
+  !llvm.module.flags = !{!0}
+  !llvm.ident = !{!1}
+
+  !0 = !{i32 1, !"wchar_size", i32 4}
+  !1 = !{!"clang version 11.0.0 "}
+  !2 = !{!3, !3, i64 0}
+  !3 = !{!"int", !4, i64 0}
+  !4 = !{!"omnipotent char", !5, i64 0}
+  !5 = !{!"Simple C++ TBAA"}
+
+...
+---
+name: foo
+alignment: 8
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+failedISel: false
+tracksRegLiveness: true
+hasWinCFI: false
+registers: []
+liveins:
+  - { reg: '$w0', virtual-reg: '' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap: false
+  hasPatchPoint: false
+  stackSize: 16
+  offsetAdjustment: 0
+  maxAlignment: 8
+  adjustsStack: false
+  hasCalls: false
+  stackProtector: ''
+  maxCallFrameSize: 0
+  cvBytesOfCalleeSavedRegisters: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart: false
+  hasMustTailInVarArgFunc: false
+  localFrameSize: 16
+  savePoint: ''
+  restorePoint: ''
+fixedStack: []
+stack:
+  - { id: 0, type: default, offset: -16, size: 16,
+      alignment: 8, stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+      local-offset: -16, debug-info-variable: '', debug-info-expression: '',
+      debug-info-location: '' }
+callSites: []
+constants: []
+machineFunctionInfo: {}
+body: |
+  bb.0 (%ir-block.1):
+    liveins: $w0
+
+    $sp = frame-setup SUBXri $sp, 16, 0
+    $x8 = ADDXri $sp, 0, 0
+    STRXui $xzr, $sp, 1 :: (store 8 into %ir.3 + 8)
+    STRXui $xzr, $sp, 0 :: (store 8 into %ir.3)
+    renamable $w0 = LDRWroW killed renamable $x8, killed renamable $w0, 1, 1 :: (load 4 from %ir.5, !tbaa !2)
+    $sp = frame-destroy ADDXri $sp, 16, 0
+    RET undef $lr, implicit $w0
+
+...
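The hazard check above is easier to follow in isolation. Below is a minimal, self-contained C++ sketch, not part of the patch itself, of the conservative "might this instruction touch the stack?" test that the new code performs inline in findMatchingUpdateInsnForward; the free-standing helper and its name mayAccessStack are invented for illustration:

#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/PseudoSourceValue.h"

using namespace llvm;

// Conservatively decide whether MI might load from or store to the stack.
static bool mayAccessStack(const MachineInstr &MI,
                           const MachineFrameInfo &MFI) {
  if (!MI.mayLoadOrStore())
    return false;
  // No memory operands means we know nothing about the access, so we must
  // assume the worst.
  if (MI.memoperands_empty())
    return true;
  for (const MachineMemOperand *MMO : MI.memoperands()) {
    const PseudoSourceValue *PSV = MMO->getPseudoValue();
    // A plain IR pointer (no PseudoSourceValue) may point anywhere,
    // including the stack; otherwise, ask the PSV whether it can alias a
    // frame object.
    if (!PSV || PSV->mayAlias(&MFI))
      return true;
  }
  return false;
}

An instruction without memory operands yields no PseudoSourceValue to query, which is why both the patch and this sketch treat it as a potential stack access; that conservatism is exactly what the FIXME about a proper may-alias API refers to.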
diff --git a/llvm/test/CodeGen/AArch64/arm64-nvcast.ll b/llvm/test/CodeGen/AArch64/arm64-nvcast.ll
--- a/llvm/test/CodeGen/AArch64/arm64-nvcast.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-nvcast.ll
@@ -1,31 +1,41 @@
 ; RUN: llc < %s -mtriple=arm64-apple-ios | FileCheck %s
 
-; CHECK-LABEL: _test:
-; CHECK-DAG: fmov.2d v0, #2.00000000
-; CHECK-DAG: and [[MASK_IDX:x[0-9]+]], x1, #0x3
-; CHECK-DAG: mov x9, sp
-; CHECK-DAG: str q0, [sp], #16
-; CHECK-DAG: bfi [[PTR:x[0-9]+]], [[MASK_IDX]], #2, #2
-; CHECK: ldr s0, {{\[}}[[PTR]]{{\]}}
-; CHECK: str s0, [x0]
-
 define void @test(float * %p1, i32 %v1) {
+; CHECK-LABEL: test:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    sub sp, sp, #16 ; =16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    ; kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT:    fmov.2d v0, #2.00000000
+; CHECK-NEXT:    and x8, x1, #0x3
+; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    str q0, [sp]
+; CHECK-NEXT:    bfi x9, x8, #2, #2
+; CHECK-NEXT:    ldr s0, [x9]
+; CHECK-NEXT:    str s0, [x0]
+; CHECK-NEXT:    add sp, sp, #16 ; =16
+; CHECK-NEXT:    ret
 entry:
   %v2 = extractelement <3 x float> <float 0.000000e+00, float 2.000000e+00, float 0.000000e+00>, i32 %v1
   store float %v2, float* %p1, align 4
   ret void
 }
 
-; CHECK-LABEL: _test2
-; CHECK: movi.16b v0, #63
-; CHECK-DAG: and [[MASK_IDX:x[0-9]+]], x1, #0x3
-; CHECK-DAG: str q0, [sp], #16
-; CHECK-DAG: mov x9, sp
-; CHECK-DAG: bfi [[PTR:x[0-9]+]], [[MASK_IDX]], #2, #2
-; CHECK: ldr s0, {{\[}}[[PTR]]{{\]}}
-; CHECK: str s0, [x0]
-
 define void @test2(float * %p1, i32 %v1) {
+; CHECK-LABEL: test2:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    sub sp, sp, #16 ; =16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    ; kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT:    movi.16b v0, #63
+; CHECK-NEXT:    and x8, x1, #0x3
+; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    str q0, [sp]
+; CHECK-NEXT:    bfi x9, x8, #2, #2
+; CHECK-NEXT:    ldr s0, [x9]
+; CHECK-NEXT:    str s0, [x0]
+; CHECK-NEXT:    add sp, sp, #16 ; =16
+; CHECK-NEXT:    ret
 entry:
   %v2 = extractelement <3 x float> <float 0x3FE7E7E7E0000000, float 0x3FE7E7E7E0000000, float 0x3FE7E7E7E0000000>, i32 %v1
   store float %v2, float* %p1, align 4
diff --git a/llvm/test/CodeGen/AArch64/arm64-windows-calls.ll b/llvm/test/CodeGen/AArch64/arm64-windows-calls.ll
--- a/llvm/test/CodeGen/AArch64/arm64-windows-calls.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-windows-calls.ll
@@ -24,10 +24,12 @@
 %struct.S2 = type { i32, i32, i32, i32 }
 
 define dso_local [2 x i64] @"?f2"() {
 entry:
+; FIXME: Missed optimization: the entire SP push/pop could be removed.
 ; CHECK-LABEL: f2
-; CHECK: stp xzr, xzr, [sp], #16
-; CHECK: mov x0, xzr
-; CHECK: mov x1, xzr
+; CHECK: stp xzr, xzr, [sp, #-16]!
+; CHECK-NEXT: mov x0, xzr
+; CHECK-NEXT: mov x1, xzr
+; CHECK-NEXT: add sp, sp, #16
   %retval = alloca %struct.S2, align 4
   %a = getelementptr inbounds %struct.S2, %struct.S2* %retval, i32 0, i32 0
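The arm64-windows-calls.ll change shows the Windows side of the fix: the post-index pop is no longer formed, leaving the explicit add sp, sp, #16. The guard that produces this behaviour amounts to the following standalone C++ sketch; it is not patch code, the helper name shouldBlockSPFoldForWinCFI is invented, and the patch performs the same check inline before the forward scan:

#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/IR/Function.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;

// Refuse to fold SP updates into loads/stores whenever the function must
// carry Windows unwind info: the folded form would no longer match the
// emitted SEH unwind opcodes.
static bool shouldBlockSPFoldForWinCFI(const MachineInstr &MemMI) {
  const MachineFunction &MF = *MemMI.getMF();
  const MCAsmInfo *MAI = MF.getTarget().getMCAsmInfo();
  return MAI->usesWindowsCFI() &&
         MF.getFunction().needsUnwindTableEntry();
}

Blocking the fold is the conservative choice: merging the SP adjustment changes the prologue/epilogue shape that the Windows unwind info describes, so until that info can be rewritten alongside the fold, keeping the separate SP adjustment is the safe behaviour the updated CHECK lines pin down.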