diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.h b/llvm/lib/Target/AArch64/AArch64FrameLowering.h --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.h +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.h @@ -148,6 +148,18 @@ const StackOffset &OffsetFromDefCFA) const; bool shouldCombineCSRLocalStackBumpInEpilogue(MachineBasicBlock &MBB, unsigned StackBumpBytes) const; + + /// Replace a StackProbe stub (if any) with the actual probe code inline + virtual void inlineStackProbe(MachineFunction &MF, + MachineBasicBlock &PrologueMBB) const override; + MachineBasicBlock::iterator + inlineStackProbeFixed(MachineBasicBlock::iterator MBBI) const; + MachineBasicBlock::iterator + inlineStackProbeVar(MachineBasicBlock::iterator MBBI) const; + MachineBasicBlock::iterator + inlineStackProbeLoopExactMultiple(MachineBasicBlock::iterator MBBI, + int64_t NegProbeSize, + Register TargetReg) const; }; } // End llvm namespace diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -1091,6 +1091,7 @@ const Function &F = MF.getFunction(); const AArch64Subtarget &Subtarget = MF.getSubtarget(); const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); + const AArch64TargetLowering &TLI = *Subtarget.getTargetLowering(); const TargetInstrInfo *TII = Subtarget.getInstrInfo(); MachineModuleInfo &MMI = MF.getMMI(); AArch64FunctionInfo *AFI = MF.getInfo(); @@ -1347,12 +1348,13 @@ NumBytes = 0; } - StackOffset AllocateBefore = SVEStackSize, AllocateAfter = {}; + StackOffset AllocateBefore = {}, AllocateAfter = SVEStackSize; MachineBasicBlock::iterator CalleeSavesBegin = MBBI, CalleeSavesEnd = MBBI; // Process the SVE callee-saves to determine what space needs to be // allocated. if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) { + LLVM_DEBUG(dbgs() << "SVECalleeSavedStackSize = " << CalleeSavedSize << "\n"); // Find callee save instructions in frame. CalleeSavesBegin = MBBI; assert(IsSVECalleeSave(CalleeSavesBegin) && "Unexpected instruction"); @@ -1363,22 +1365,42 @@ AllocateBefore = StackOffset::getScalable(CalleeSavedSize); AllocateAfter = SVEStackSize - AllocateBefore; } + LLVM_DEBUG(dbgs() << "AllocateBefore = " << AllocateBefore.getFixed() << ", " + << AllocateBefore.getScalable() << "\n"); + LLVM_DEBUG(dbgs() << "AllocateAfter = " << AllocateAfter.getFixed() << ", " + << AllocateAfter.getScalable() << "\n"); - // Allocate space for the callee saves (if any). + // Allocate space for the SVE callee saves (if any). + // This space doesn't need stack probing, because it will all be written to + // when saving the CSRs. emitFrameOffset(MBB, CalleeSavesBegin, DL, AArch64::SP, AArch64::SP, -AllocateBefore, TII, MachineInstr::FrameSetup); // Finally allocate remaining SVE stack space. 
- emitFrameOffset(MBB, CalleeSavesEnd, DL, AArch64::SP, AArch64::SP, - -AllocateAfter, TII, - MachineInstr::FrameSetup); + if (TLI.hasInlineStackProbe(MF) && AllocateAfter) { + Register ScratchReg = findScratchNonCalleeSaveRegister(&MBB); + assert(ScratchReg != AArch64::NoRegister); + emitFrameOffset(MBB, CalleeSavesEnd, DL, ScratchReg, AArch64::SP, + -AllocateAfter, TII, MachineInstr::FrameSetup); + BuildMI(MBB, MBBI, DL, TII->get(AArch64::PROBED_STACKALLOC_VAR)) + .addUse(ScratchReg); + } else { + emitFrameOffset(MBB, CalleeSavesEnd, DL, AArch64::SP, AArch64::SP, + -AllocateAfter, TII, MachineInstr::FrameSetup); + } // Allocate space for the rest of the frame. if (NumBytes) { // Alignment is required for the parent frame, not the funclet const bool NeedsRealignment = !IsFunclet && RegInfo->needsStackRealignment(MF); + bool NeedsStackProbe = TLI.hasInlineStackProbe(MF) && + (NumBytes >= TLI.getStackProbeMaxUnprobedStack(MF) || + MFI.hasVarSizedObjects()); + if (NeedsRealignment) + NeedsStackProbe |= TLI.hasInlineStackProbe(MF) && + (NumBytes + MFI.getMaxAlign().value()) >= TLI.getStackProbeMaxUnprobedStack(MF); unsigned scratchSPReg = AArch64::SP; if (NeedsRealignment) { @@ -1387,13 +1409,26 @@ } // If we're a leaf function, try using the red zone. - if (!canUseRedZone(MF)) + if (!canUseRedZone(MF)) { // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have // the correct value here, as NumBytes also includes padding bytes, // which shouldn't be counted here. - emitFrameOffset(MBB, MBBI, DL, scratchSPReg, AArch64::SP, - StackOffset::getFixed(-NumBytes), TII, - MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI); + if (NeedsStackProbe && !NeedsRealignment) { + // If we don't need to re-align the stack, we can use a more efficient + // sequence for stack probing. + Register ScratchReg = findScratchNonCalleeSaveRegister(&MBB); + assert(ScratchReg != AArch64::NoRegister); + BuildMI(MBB, MBBI, DL, + TII->get(AArch64::PROBED_STACKALLOC)) + .addDef(ScratchReg) + .addImm(-NumBytes); + } else { + emitFrameOffset(MBB, MBBI, DL, scratchSPReg, AArch64::SP, + StackOffset::getFixed(-NumBytes), TII, + MachineInstr::FrameSetup, false, NeedsWinCFI, + &HasWinCFI); + } + } if (NeedsRealignment) { const unsigned NrBitsToZero = Log2(MFI.getMaxAlign()); @@ -1411,9 +1446,17 @@ | ((64 - NrBitsToZero) << 6) // immr | ((64 - NrBitsToZero - 1) << 0); // imms - BuildMI(MBB, MBBI, DL, TII->get(AArch64::ANDXri), AArch64::SP) - .addReg(scratchSPReg, RegState::Kill) - .addImm(andMaskEncoded); + if (NeedsStackProbe) { + BuildMI(MBB, MBBI, DL, TII->get(AArch64::ANDXri), scratchSPReg) + .addReg(scratchSPReg, RegState::Kill) + .addImm(andMaskEncoded); + BuildMI(MBB, MBBI, DL, TII->get(AArch64::PROBED_STACKALLOC_VAR)) + .addUse(scratchSPReg); + } else { + BuildMI(MBB, MBBI, DL, TII->get(AArch64::ANDXri), AArch64::SP) + .addReg(scratchSPReg, RegState::Kill) + .addImm(andMaskEncoded); + } AFI->setStackRealigned(true); if (NeedsWinCFI) { HasWinCFI = true; @@ -3580,3 +3623,156 @@ dbgs() << "\n"; }); } + +/// Emit a loop to decrement SP until it is equal to TargetReg, with probes at +/// least every NegProbeSize bytes. Returns an iterator of the first instruction +/// after the loop. The difference between SP and TargetReg must be an exact +/// multiple of NegProbeSize. 
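+/// The emitted control flow is roughly (before any later scheduling), with
+/// ProbeSize standing for -NegProbeSize:
+///   LoopMBB:
+///     SUB  SP, SP, #ProbeSize
+///     STR  XZR, [SP]
+///     CMP  SP, TargetReg
+///     B.NE LoopMBB
+///   ExitMBB: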
+MachineBasicBlock::iterator
+AArch64FrameLowering::inlineStackProbeLoopExactMultiple(
+    MachineBasicBlock::iterator MBBI, int64_t NegProbeSize,
+    Register TargetReg) const {
+  MachineBasicBlock &MBB = *MBBI->getParent();
+  MachineFunction &MF = *MBB.getParent();
+  const AArch64InstrInfo *TII =
+      MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
+  DebugLoc DL = MBB.findDebugLoc(MBBI);
+
+  MachineFunction::iterator MBBInsertPoint = std::next(MBB.getIterator());
+  MachineBasicBlock *LoopMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock());
+  MF.insert(MBBInsertPoint, LoopMBB);
+  MachineBasicBlock *ExitMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock());
+  MF.insert(MBBInsertPoint, ExitMBB);
+
+  // ADD SP, SP, #NegProbeSize (or equivalent if NegProbeSize is not encodable
+  // in ADD).
+  emitFrameOffset(*LoopMBB, LoopMBB->end(), DL, AArch64::SP, AArch64::SP,
+                  StackOffset::getFixed(NegProbeSize), TII,
+                  MachineInstr::FrameSetup);
+  // STR XZR, [SP]
+  BuildMI(*LoopMBB, LoopMBB->end(), DL, TII->get(AArch64::STRXui))
+      .addReg(AArch64::XZR)
+      .addReg(AArch64::SP)
+      .addImm(0)
+      .setMIFlags(MachineInstr::FrameSetup);
+  // CMP SP, TargetReg
+  BuildMI(*LoopMBB, LoopMBB->end(), DL, TII->get(AArch64::SUBSXrx64),
+          AArch64::XZR)
+      .addReg(AArch64::SP)
+      .addReg(TargetReg)
+      .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 0))
+      .setMIFlags(MachineInstr::FrameSetup);
+  // B.NE Loop
+  BuildMI(*LoopMBB, LoopMBB->end(), DL, TII->get(AArch64::Bcc))
+      .addImm(AArch64CC::NE)
+      .addMBB(LoopMBB)
+      .setMIFlags(MachineInstr::FrameSetup);
+
+  LoopMBB->addSuccessor(ExitMBB);
+  LoopMBB->addSuccessor(LoopMBB);
+  // Synthesize the exit MBB.
+  ExitMBB->splice(ExitMBB->end(), &MBB, std::next(MBBI), MBB.end());
+  ExitMBB->transferSuccessorsAndUpdatePHIs(&MBB);
+  MBB.addSuccessor(LoopMBB);
+  // Update liveins.
+  recomputeLiveIns(*LoopMBB);
+  recomputeLiveIns(*ExitMBB);
+
+  return ExitMBB->begin();
+}
+
+MachineBasicBlock::iterator AArch64FrameLowering::inlineStackProbeFixed(
+    MachineBasicBlock::iterator MBBI) const {
+  MachineBasicBlock &MBB = *MBBI->getParent();
+  MachineFunction &MF = *MBB.getParent();
+  const AArch64TargetLowering *TLI =
+      MF.getSubtarget<AArch64Subtarget>().getTargetLowering();
+  const AArch64InstrInfo *TII =
+      MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
+
+  DebugLoc DL = MBB.findDebugLoc(MBBI);
+  Register ScratchReg = MBBI->getOperand(0).getReg();
+  int64_t NegFrameSize = MBBI->getOperand(1).getImm();
+  int64_t NegProbeSize = -(int64_t)TLI->getStackProbeSize(MF);
+  int64_t NumBlocks = NegFrameSize / NegProbeSize;
+  int64_t NegResidualSize = NegFrameSize % NegProbeSize;
+  MachineBasicBlock::iterator NextInst;
+  LLVM_DEBUG(dbgs() << "Stack probing: total " << NegFrameSize << " bytes, "
+                    << NumBlocks << " blocks of " << NegProbeSize
+                    << " bytes, plus " << NegResidualSize << " bytes\n");
+
+  if (NegResidualSize != 0) {
+    // ADD SP, SP, #NegResidualSize (or equivalent if NegResidualSize is not
+    // encodable in ADD).
+    emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
+                    StackOffset::getFixed(NegResidualSize), TII,
+                    MachineInstr::FrameSetup);
+    // STR XZR, [SP]
+    BuildMI(MBB, MBBI, DL, TII->get(AArch64::STRXui))
+        .addReg(AArch64::XZR)
+        .addReg(AArch64::SP)
+        .addImm(0)
+        .setMIFlags(MachineInstr::FrameSetup);
+  }
+
+  if (NumBlocks < 3) {
+    for (int i = 0; i < NumBlocks; ++i) {
+      // ADD SP, SP, #NegProbeSize (or equivalent if NegProbeSize is not
+      // encodable in ADD).
+      emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
+                      StackOffset::getFixed(NegProbeSize), TII,
+                      MachineInstr::FrameSetup);
+      // STR XZR, [SP]
+      BuildMI(MBB, MBBI, DL, TII->get(AArch64::STRXui))
+          .addReg(AArch64::XZR)
+          .addReg(AArch64::SP)
+          .addImm(0)
+          .setMIFlags(MachineInstr::FrameSetup);
+    }
+    NextInst = std::next(MBBI);
+  } else if (NumBlocks != 0) {
+    // ADD ScratchReg, SP, #NegProbeSize * NumBlocks (or equivalent if that is
+    // not encodable in ADD).
+    emitFrameOffset(MBB, MBBI, DL, ScratchReg, AArch64::SP,
+                    StackOffset::getFixed(NegProbeSize * NumBlocks), TII,
+                    MachineInstr::FrameSetup);
+    NextInst =
+        inlineStackProbeLoopExactMultiple(MBBI, NegProbeSize, ScratchReg);
+  }
+
+  MBBI->eraseFromParent();
+  return NextInst;
+}
+
+MachineBasicBlock::iterator AArch64FrameLowering::inlineStackProbeVar(
+    MachineBasicBlock::iterator MBBI) const {
+  MachineBasicBlock &MBB = *MBBI->getParent();
+  MachineFunction &MF = *MBB.getParent();
+  const AArch64TargetLowering *TLI =
+      MF.getSubtarget<AArch64Subtarget>().getTargetLowering();
+  const AArch64InstrInfo *TII =
+      MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
+
+  DebugLoc DL = MBB.findDebugLoc(MBBI);
+  Register TargetReg = MBBI->getOperand(0).getReg();
+  int64_t NegProbeSize = -(int64_t)TLI->getStackProbeSize(MF);
+  MachineBasicBlock::iterator NextInst = std::next(MBBI);
+
+  NextInst = TII->insertStackProbingLoop(MBBI, NegProbeSize, TargetReg);
+
+  MBBI->eraseFromParent();
+  return NextInst;
+}
+
+void AArch64FrameLowering::inlineStackProbe(
+    MachineFunction &MF, MachineBasicBlock &MBB) const {
+  for (auto MBBI = MBB.begin(), E = MBB.end(); MBBI != E;) {
+    if (MBBI->getOpcode() == AArch64::PROBED_STACKALLOC) {
+      MBBI = inlineStackProbeFixed(MBBI);
+      E = MBBI->getParent()->end();
+    } else if (MBBI->getOpcode() == AArch64::PROBED_STACKALLOC_VAR) {
+      MBBI = inlineStackProbeVar(MBBI);
+      E = MBBI->getParent()->end();
+    } else {
+      ++MBBI;
+    }
+  }
+}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -815,6 +815,17 @@
     return 128;
   }
 
+  /// True if stack clash protection is enabled for this function.
+  bool hasInlineStackProbe(MachineFunction &MF) const override;
+
+  /// Get the interval between stack-clash probes, which is equal to the stack
+  /// guard size, in bytes.
+  unsigned getStackProbeSize(MachineFunction &MF) const;
+
+  /// Get the maximum allowed number of unprobed bytes above SP at an ABI
+  /// boundary.
+  unsigned getStackProbeMaxUnprobedStack(MachineFunction &MF) const;
+
 private:
   /// Keep a pointer to the AArch64Subtarget around so that we can
   /// make the right decision when generating code for different targets.
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -16623,6 +16623,45 @@
   return TargetLowering::getSSPStackGuardCheck(M);
 }
 
+bool AArch64TargetLowering::hasInlineStackProbe(MachineFunction &MF) const {
+  // If the function specifically requests inline stack probes, emit them.
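+  // Only the "inline-asm" value is handled here (this is what e.g. Clang
+  // emits for -fstack-clash-protection on targets that implement it); any
+  // other value, such as the name of a probing function, hits the
+  // llvm_unreachable below.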
+  if (MF.getFunction().hasFnAttribute("probe-stack")) {
+    if (MF.getFunction().getFnAttribute("probe-stack").getValueAsString() ==
+        "inline-asm")
+      return true;
+    else
+      llvm_unreachable("Unsupported stack probing method");
+  }
+
+  return false;
+}
+
+unsigned AArch64TargetLowering::getStackProbeSize(MachineFunction &MF) const {
+  const TargetFrameLowering *TFI = Subtarget->getFrameLowering();
+  unsigned StackAlign = TFI->getStackAlignment();
+  assert(StackAlign >= 1 && isPowerOf2_32(StackAlign) &&
+         "Unexpected stack alignment");
+  // The default stack probe size is 4096 if the function has no
+  // stack-probe-size attribute. This is a safe default because it is the
+  // smallest possible guard page size.
+  unsigned StackProbeSize = 4096;
+  const Function &Fn = MF.getFunction();
+  if (Fn.hasFnAttribute("stack-probe-size"))
+    Fn.getFnAttribute("stack-probe-size")
+        .getValueAsString()
+        .getAsInteger(0, StackProbeSize);
+  // Round down to the stack alignment.
+  StackProbeSize &= ~(StackAlign - 1);
+  return StackProbeSize ? StackProbeSize : StackAlign;
+}
+
+unsigned AArch64TargetLowering::getStackProbeMaxUnprobedStack(
+    MachineFunction &MF) const {
+  // We assume and guarantee that, at an ABI boundary, the last probe was no
+  // more than 1024 bytes above SP.
+  return 1024;
+}
+
 Value *AArch64TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
   // Android provides a fixed TLS slot for the SafeStack pointer. See the
   // definition of TLS_SLOT_SAFESTACK in
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -311,6 +311,14 @@
   static void decomposeStackOffsetForDwarfOffsets(const StackOffset &Offset,
                                                   int64_t &ByteSized,
                                                   int64_t &VGSized);
+
+  /// Insert code to set SP to the value in TargetReg, ensuring that memory is
+  /// written to every NegProbeSize bytes. TargetReg must be below SP, and has
+  /// no alignment requirements other than the usual 16-byte alignment for SP.
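+  /// Returns an iterator to the first instruction after the inserted loop.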
+ MachineBasicBlock::iterator + insertStackProbingLoop(MachineBasicBlock::iterator MBBI, int64_t NegProbeSize, + Register TargetReg) const; + #define GET_INSTRINFO_HELPER_DECLS #include "AArch64GenInstrInfo.inc" diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -7215,6 +7215,90 @@ return AArch64::BLR; } +MachineBasicBlock::iterator +AArch64InstrInfo::insertStackProbingLoop(MachineBasicBlock::iterator MBBI, + int64_t NegProbeSize, + Register TargetReg) const { + MachineBasicBlock &MBB = *MBBI->getParent(); + MachineFunction &MF = *MBB.getParent(); + const AArch64InstrInfo *TII = + MF.getSubtarget().getInstrInfo(); + DebugLoc DL = MBB.findDebugLoc(MBBI); + + MachineFunction::iterator MBBInsertPoint = std::next(MBB.getIterator()); + MachineBasicBlock *LoopTestMBB = + MF.CreateMachineBasicBlock(MBB.getBasicBlock()); + MF.insert(MBBInsertPoint, LoopTestMBB); + MachineBasicBlock *LoopBodyMBB = + MF.CreateMachineBasicBlock(MBB.getBasicBlock()); + MF.insert(MBBInsertPoint, LoopBodyMBB); + MachineBasicBlock *ExitMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock()); + MF.insert(MBBInsertPoint, ExitMBB); + + // LoopTest: + // SUB SP, SP, #ProbeSize + emitFrameOffset(*LoopTestMBB, LoopTestMBB->end(), DL, AArch64::SP, + AArch64::SP, StackOffset::getFixed(NegProbeSize), TII, + MachineInstr::FrameSetup); + + // CMP SP, TargetReg + BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(AArch64::SUBSXrx64), + AArch64::XZR) + .addReg(AArch64::SP) + .addReg(TargetReg) + .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 0)) + .setMIFlags(MachineInstr::FrameSetup); + + // B.LE LoopExit + BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(AArch64::Bcc)) + .addImm(AArch64CC::LE) + .addMBB(ExitMBB) + .setMIFlags(MachineInstr::FrameSetup); + + // STR XZR, [SP] + BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII->get(AArch64::STRXui)) + .addReg(AArch64::XZR) + .addReg(AArch64::SP) + .addImm(0) + .setMIFlags(MachineInstr::FrameSetup); + + // B loop + BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII->get(AArch64::B)) + .addMBB(LoopTestMBB); + + // LoopExit: + // MOV SP, TargetReg + BuildMI(*ExitMBB, ExitMBB->end(), DL, TII->get(AArch64::ADDXri), AArch64::SP) + .addReg(TargetReg) + .addImm(0) + .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)) + .setMIFlags(MachineInstr::FrameSetup); + + // STR XZR, [SP] + BuildMI(*ExitMBB, ExitMBB->end(), DL, TII->get(AArch64::STRXui)) + .addReg(AArch64::XZR) + .addReg(AArch64::SP) + .addImm(0) + .setMIFlags(MachineInstr::FrameSetup); + + LoopTestMBB->addSuccessor(ExitMBB); + LoopTestMBB->addSuccessor(LoopBodyMBB); + LoopBodyMBB->addSuccessor(LoopTestMBB); + // Synthesize the exit MBB. + ExitMBB->splice(ExitMBB->end(), &MBB, std::next(MBBI), MBB.end()); + ExitMBB->transferSuccessorsAndUpdatePHIs(&MBB); + MBB.addSuccessor(LoopTestMBB); + + // Update liveins. 
+ if (MF.getRegInfo().reservedRegsFrozen()) { + recomputeLiveIns(*LoopTestMBB); + recomputeLiveIns(*LoopBodyMBB); + recomputeLiveIns(*ExitMBB); + } + + return ExitMBB->begin(); +} + #define GET_INSTRINFO_HELPERS #define GET_INSTRMAP_INFO #include "AArch64GenInstrInfo.inc" diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -653,6 +653,21 @@ def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2), [(AArch64callseq_end timm:$amt1, timm:$amt2)]>, Sched<[]>; + +// Probed stack allocation of a constant size, used in function prologues when +// stack-clash protection is enabled. +def PROBED_STACKALLOC : Pseudo<(outs GPR64:$scratch), + (ins i64imm:$stacksize), + []>, + Sched<[]>; +// Probed stack allocation of a variable size, used in function prologues when +// stack-clash protection is enabled. The register input is the target SP, +// which should be below the current value, and has no alignment requirements +// beyond the usual 16-byte alignment for SP. +def PROBED_STACKALLOC_VAR : Pseudo<(outs), + (ins GPR64:$target), + []>, + Sched<[]>; } // Defs = [SP], Uses = [SP], hasSideEffects = 1, isCodeGenOnly = 1 let isReMaterializable = 1, isCodeGenOnly = 1 in { diff --git a/llvm/test/CodeGen/AArch64/stack-probing-64k.ll b/llvm/test/CodeGen/AArch64/stack-probing-64k.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/stack-probing-64k.ll @@ -0,0 +1,289 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs | FileCheck %s +; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs -global-isel | FileCheck %s + +; Tests for prolog sequences for stack probing, when using a 64KiB stack guard. + +; Small stack frame, no probing required. +define void @static_64(i8** %out) "probe-stack"="inline-asm" "stack-probe-size"="65536" "frame-pointer"="none" { +; CHECK-LABEL: static_64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub sp, sp, #64 // =64 +; CHECK-NEXT: .cfi_def_cfa_offset 64 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #64 // =64 +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 64, align 1 + store i8* %vla, i8** %out, align 8 + ret void +} + +; At 256 bytes we start to always create a frame pointer. No frame smaller then +; this needs a probe, so we can use the saving of at least one CSR as a probe +; at the top of our frame. +define void @static_256(i8** %out) "probe-stack"="inline-asm" "stack-probe-size"="65536" "frame-pointer"="none" { +; CHECK-LABEL: static_256: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub sp, sp, #272 // =272 +; CHECK-NEXT: str x29, [sp, #256] // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 272 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #272 // =272 +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 256, align 1 + store i8* %vla, i8** %out, align 8 + ret void +} + +; At just less that 1024 bytes, this is the largest frame which doesn't need +; probing. +define void @static_1008(i8** %out) "probe-stack"="inline-asm" "stack-probe-size"="65536" "frame-pointer"="none" { +; CHECK-LABEL: static_1008: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #1008 // =1008 +; CHECK-NEXT: .cfi_def_cfa_offset 1024 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #1008 // =1008 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 1008, align 1 + store i8* %vla, i8** %out, align 8 + ret void +} + +; At 1024 bytes, we need to start probing to guarantee that SP does not go too +; far into the guard at an ABI boundary. +define void @static_1024(i8** %out) "probe-stack"="inline-asm" "stack-probe-size"="65536" "frame-pointer"="none" { +; CHECK-LABEL: static_1024: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #1024 // =1024 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: .cfi_def_cfa_offset 1040 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #1024 // =1024 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 1024, align 1 + store i8* %vla, i8** %out, align 8 + ret void +} + +; The stack offset does not fit into one SUB unstruction, so two are used. +define void @static_65520(i8** %out) "probe-stack"="inline-asm" "stack-probe-size"="65536" "frame-pointer"="none" { +; CHECK-LABEL: static_65520: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #15, lsl #12 // =61440 +; CHECK-NEXT: sub sp, sp, #4080 // =4080 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: .cfi_def_cfa_offset 65536 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #15, lsl #12 // =61440 +; CHECK-NEXT: add sp, sp, #4080 // =4080 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 65520, align 1 + store i8* %vla, i8** %out, align 8 + ret void +} + + +; 64k bytes is the largest frame we can probe in one go. +define void @static_65536(i8** %out) "probe-stack"="inline-asm" "stack-probe-size"="65536" "frame-pointer"="none" { +; CHECK-LABEL: static_65536: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: .cfi_def_cfa_offset 65552 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #16, lsl #12 // =65536 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 65536, align 1 + store i8* %vla, i8** %out, align 8 + ret void +} + +; Smallest stack frame (64k+16) which needs two probes (the first, smaller one gets +; folded into a store with writeback). +define void @static_65552(i8** %out) "probe-stack"="inline-asm" "stack-probe-size"="65536" "frame-pointer"="none" { +; CHECK-LABEL: static_65552: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: str xzr, [sp, #-16]! 
+; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: .cfi_def_cfa_offset 65568 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #16, lsl #12 // =65536 +; CHECK-NEXT: add sp, sp, #16 // =16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 65552, align 1 + store i8* %vla, i8** %out, align 8 + ret void +} + +; Largest frame needing two probes. +define void @static_131072(i8** %out) "probe-stack"="inline-asm" "stack-probe-size"="65536" "frame-pointer"="none" { +; CHECK-LABEL: static_131072: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: .cfi_def_cfa_offset 131088 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #32, lsl #12 // =131072 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 131072, align 1 + store i8* %vla, i8** %out, align 8 + ret void +} + +; Largest frame probed without a loop (3 probes). +define void @static_196592(i8** %out) "probe-stack"="inline-asm" "stack-probe-size"="65536" "frame-pointer"="none" { +; CHECK-LABEL: static_196592: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #15, lsl #12 // =61440 +; CHECK-NEXT: sub sp, sp, #4080 // =4080 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: .cfi_def_cfa_offset 196608 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #47, lsl #12 // =192512 +; CHECK-NEXT: add sp, sp, #4080 // =4080 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 196592, align 1 + store i8* %vla, i8** %out, align 8 + ret void +} + +; Smallest frame probed with a loop. +define void @static_196608(i8** %out) "probe-stack"="inline-asm" "stack-probe-size"="65536" "frame-pointer"="none" { +; CHECK-LABEL: static_196608: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: sub x9, sp, #48, lsl #12 // =196608 +; CHECK-NEXT: .LBB9_1: // %entry +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 +; CHECK-NEXT: cmp sp, x9 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: b.ne .LBB9_1 +; CHECK-NEXT: // %bb.2: // %entry +; CHECK-NEXT: .cfi_def_cfa_offset 196624 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #48, lsl #12 // =196608 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 196608, align 1 + store i8* %vla, i8** %out, align 8 + ret void +} + +; Large enough to use a loop, but not a miltiple of 64KiB so needs an extra +; probe for the remainder. +define void @static_197632(i8** %out) "probe-stack"="inline-asm" "stack-probe-size"="65536" "frame-pointer"="none" { +; CHECK-LABEL: static_197632: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #1024 // =1024 +; CHECK-NEXT: sub x9, sp, #48, lsl #12 // =196608 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: .LBB10_1: // %entry +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 +; CHECK-NEXT: cmp sp, x9 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: b.ne .LBB10_1 +; CHECK-NEXT: // %bb.2: // %entry +; CHECK-NEXT: .cfi_def_cfa_offset 197648 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #48, lsl #12 // =196608 +; CHECK-NEXT: add sp, sp, #1024 // =1024 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 197632, align 1 + store i8* %vla, i8** %out, align 8 + ret void +} + +; A small allocation, but with a very large alignment requirement. We do this +; by moving SP far enough that a sufficiently-aligned block will exist +; somewhere in the stack frame, so must probe the whole of that larger SP move. +define void @static_16_align_8192(i8** %out) "probe-stack"="inline-asm" "stack-probe-size"="65536" "frame-pointer"="none" { +; CHECK-LABEL: static_16_align_8192: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-NEXT: sub x9, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: sub x9, x9, #4080 // =4080 +; CHECK-NEXT: and x9, x9, #0xffffffffffffe000 +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: .LBB11_1: // %entry +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 +; CHECK-NEXT: cmp sp, x9 +; CHECK-NEXT: b.le .LBB11_3 +; CHECK-NEXT: // %bb.2: // %entry +; CHECK-NEXT: // in Loop: Header=BB11_1 Depth=1 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: b .LBB11_1 +; CHECK-NEXT: .LBB11_3: // %entry +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: .cfi_def_cfa w29, 16 +; CHECK-NEXT: .cfi_offset w30, -8 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 16, align 8192 + store i8* %vla, i8** %out, align 8 + ret void +} diff --git a/llvm/test/CodeGen/AArch64/stack-probing-sve.ll b/llvm/test/CodeGen/AArch64/stack-probing-sve.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/stack-probing-sve.ll @@ -0,0 +1,351 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs | FileCheck %s +; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs -global-isel -global-isel-abort=2 | FileCheck %s + +; Test prolog sequences for stack probing when SVE objects are involved. + +; An SVE stack slot needs probing, because we don't know it's size at +; compile-time. +define void @sve_1_vector(** %out) "probe-stack"="inline-asm" "frame-pointer"="none" "target-features"="+sve" { +; CHECK-LABEL: sve_1_vector: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl x9, sp, #-1 +; CHECK-NEXT: .LBB0_1: // %entry +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: cmp sp, x9 +; CHECK-NEXT: b.le .LBB0_3 +; CHECK-NEXT: // %bb.2: // %entry +; CHECK-NEXT: // in Loop: Header=BB0_1 Depth=1 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: b .LBB0_1 +; CHECK-NEXT: .LBB0_3: // %entry +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %vec = alloca , align 16 + ret void +} + +; As above, but with 4 SVE vectors of stack space. +define void @sve_4_vector(** %out) "probe-stack"="inline-asm" "frame-pointer"="none" "target-features"="+sve" { +; CHECK-LABEL: sve_4_vector: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl x9, sp, #-4 +; CHECK-NEXT: .LBB1_1: // %entry +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: cmp sp, x9 +; CHECK-NEXT: b.le .LBB1_3 +; CHECK-NEXT: // %bb.2: // %entry +; CHECK-NEXT: // in Loop: Header=BB1_1 Depth=1 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: b .LBB1_1 +; CHECK-NEXT: .LBB1_3: // %entry +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: addvl sp, sp, #4 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %vec1 = alloca , align 16 + %vec2 = alloca , align 16 + %vec3 = alloca , align 16 + %vec4 = alloca , align 16 + ret void +} + +; The area allocated to save callee-saved SVE registers does not need to be +; probed, because it will always be written to, which acts as a probe. +define void @sve_1v_csr( %a) "probe-stack"="inline-asm" "frame-pointer"="none" "target-features"="+sve" { +; CHECK-LABEL: sve_1v_csr: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str z8, [sp] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: //APP +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: ldr z8, [sp] // 16-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + call void asm sideeffect "", "~{z8}" () + ret void +} + +define void @sve_4v_csr( %a) "probe-stack"="inline-asm" "frame-pointer"="none" "target-features"="+sve" { +; CHECK-LABEL: sve_4v_csr: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-4 +; CHECK-NEXT: str z11, [sp] // 16-byte Folded Spill +; CHECK-NEXT: str z10, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z9, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z8, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 32 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: //APP +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: ldr z11, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ldr z10, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z9, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z8, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #4 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + call void asm sideeffect "", "~{z8},~{z9},~{z10},~{z11}" () + ret void +} + +define void @sve_1p_csr( %a) "probe-stack"="inline-asm" "frame-pointer"="none" "target-features"="+sve" { +; CHECK-LABEL: sve_1p_csr: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: //APP +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + call void asm sideeffect "", "~{p8}" () + ret void +} + +define void @sve_4p_csr( %a) "probe-stack"="inline-asm" "frame-pointer"="none" "target-features"="+sve" { +; CHECK-LABEL: sve_4p_csr: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p11, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p10, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p9, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: //APP +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: ldr p11, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p10, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p9, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + call void asm sideeffect "", "~{p8},~{p9},~{p10},~{p11}" () + ret void +} + +; 1 SVE vector, which needs probing, and a 16-byte fixed size object, which +; doesn't. 
Here the final store of the SVE probing loop gets merged with the +; fixed-size SP decrement, but this doesn't affect probing as the pattern of +; memory access is the same. +define void @sve_1_vector_16_arr(** %out) "probe-stack"="inline-asm" "frame-pointer"="none" "target-features"="+sve" { +; CHECK-LABEL: sve_1_vector_16_arr: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl x9, sp, #-1 +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: .LBB6_1: // %entry +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: cmp sp, x9 +; CHECK-NEXT: b.le .LBB6_3 +; CHECK-NEXT: // %bb.2: // %entry +; CHECK-NEXT: // in Loop: Header=BB6_1 Depth=1 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: b .LBB6_1 +; CHECK-NEXT: .LBB6_3: // %entry +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: str xzr, [sp], #-16 +; CHECK-NEXT: .cfi_def_cfa w29, 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #16 // =16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %vec = alloca , align 16 + %arr = alloca i8, i64 16, align 1 + ret void +} + +; 1 SVE stack slot and a 4096-byte stack slot, both of which need probing. +; TODO: This could be optimised by combining the fixed-size offset into the +; loop. +define void @sve_1_vector_4096_arr(** %out) "probe-stack"="inline-asm" "frame-pointer"="none" "target-features"="+sve" { +; CHECK-LABEL: sve_1_vector_4096_arr: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl x9, sp, #-1 +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: .LBB7_1: // %entry +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: cmp sp, x9 +; CHECK-NEXT: b.le .LBB7_3 +; CHECK-NEXT: // %bb.2: // %entry +; CHECK-NEXT: // in Loop: Header=BB7_1 Depth=1 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: b .LBB7_1 +; CHECK-NEXT: .LBB7_3: // %entry +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: .cfi_def_cfa w29, 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %vec = alloca , align 16 + %arr = alloca i8, i64 4096, align 1 + ret void +} + +; 1 SVE stack slot and a large stack slot, both of which need probing. +; TODO this could be optimised by combining both loops. +define void @sve_1_vector_12288_arr(** %out) "probe-stack"="inline-asm" "frame-pointer"="none" "target-features"="+sve" { +; CHECK-LABEL: sve_1_vector_12288_arr: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl x9, sp, #-1 +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: .LBB8_1: // %entry +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: cmp sp, x9 +; CHECK-NEXT: b.le .LBB8_3 +; CHECK-NEXT: // %bb.2: // %entry +; CHECK-NEXT: // in Loop: Header=BB8_1 Depth=1 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: b .LBB8_1 +; CHECK-NEXT: .LBB8_3: // %entry +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: sub x9, sp, #3, lsl #12 // =12288 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: .LBB8_4: // %entry +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: cmp sp, x9 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: b.ne .LBB8_4 +; CHECK-NEXT: // %bb.5: // %entry +; CHECK-NEXT: .cfi_def_cfa w29, 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #3, lsl #12 // =12288 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %vec = alloca , align 16 + %arr = alloca i8, i64 12288, align 1 + ret void +} + +; Not tested: SVE stack objects with alignment >16 bytes, which isn't currently +; supported even without stack-probing. + +; 1 SVE vector, which needs probing, and a 16-byte fixed size object, which +; has a large alignment requirement so also needs a probing loop. +define void @sve_1_vector_16_arr_align_8192(** %out) "probe-stack"="inline-asm" "frame-pointer"="none" "target-features"="+sve" { +; CHECK-LABEL: sve_1_vector_16_arr_align_8192: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-NEXT: addvl x9, sp, #-1 +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: .LBB9_1: // %entry +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: cmp sp, x9 +; CHECK-NEXT: b.le .LBB9_3 +; CHECK-NEXT: // %bb.2: // %entry +; CHECK-NEXT: // in Loop: Header=BB9_1 Depth=1 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: b .LBB9_1 +; CHECK-NEXT: .LBB9_3: // %entry +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: sub x9, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: sub x9, x9, #4080 // =4080 +; CHECK-NEXT: and x9, x9, #0xffffffffffffe000 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: .LBB9_4: // %entry +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: cmp sp, x9 +; CHECK-NEXT: b.le .LBB9_6 +; CHECK-NEXT: // %bb.5: // %entry +; CHECK-NEXT: // in Loop: Header=BB9_4 Depth=1 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: b .LBB9_4 +; CHECK-NEXT: .LBB9_6: // %entry +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: .cfi_def_cfa w29, 16 +; CHECK-NEXT: .cfi_offset w30, -8 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: ret +entry: + %vec = alloca , align 16 + %arr = alloca i8, i64 16, align 8192 + ret void +} + +; For 64k guard pages, the only difference is the constant subtracted from SP +; in the loop. +define void @sve_64k_guard(** %out) "probe-stack"="inline-asm" "frame-pointer"="none" "target-features"="+sve" "stack-probe-size"="65536" { +; CHECK-LABEL: sve_64k_guard: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl x9, sp, #-1 +; CHECK-NEXT: .LBB10_1: // %entry +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 +; CHECK-NEXT: cmp sp, x9 +; CHECK-NEXT: b.le .LBB10_3 +; CHECK-NEXT: // %bb.2: // %entry +; CHECK-NEXT: // in Loop: Header=BB10_1 Depth=1 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: b .LBB10_1 +; CHECK-NEXT: .LBB10_3: // %entry +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %vec = alloca , align 16 + ret void +} + +; Not tested: dynamic allocations of SVE vectors, which don't currently work +; without stack probing. diff --git a/llvm/test/CodeGen/AArch64/stack-probing.ll b/llvm/test/CodeGen/AArch64/stack-probing.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/stack-probing.ll @@ -0,0 +1,291 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs | FileCheck %s +; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs -global-isel | FileCheck %s + +; Tests for prolog sequences for stack probing, when using a 4KiB stack guard. + +; Small stack frame, no probing required. +define void @static_64(i8** %out) "probe-stack"="inline-asm" "frame-pointer"="none" { +; CHECK-LABEL: static_64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub sp, sp, #64 // =64 +; CHECK-NEXT: .cfi_def_cfa_offset 64 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #64 // =64 +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 64, align 1 + store i8* %vla, i8** %out, align 8 + ret void +} + +; At 256 bytes we start to always create a frame pointer. No frame smaller then +; this needs a probe, so we can use the saving of at least one CSR as a probe +; at the top of our frame. +define void @static_256(i8** %out) "probe-stack"="inline-asm" "frame-pointer"="none" { +; CHECK-LABEL: static_256: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub sp, sp, #272 // =272 +; CHECK-NEXT: str x29, [sp, #256] // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 272 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #272 // =272 +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 256, align 1 + store i8* %vla, i8** %out, align 8 + ret void +} + +; At just less that 1024 bytes, this is the largest frame which doesn't need +; probing. +define void @static_1008(i8** %out) "probe-stack"="inline-asm" "frame-pointer"="none" { +; CHECK-LABEL: static_1008: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #1008 // =1008 +; CHECK-NEXT: .cfi_def_cfa_offset 1024 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #1008 // =1008 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 1008, align 1 + store i8* %vla, i8** %out, align 8 + ret void +} + +; At 1024 bytes, we need to start probing to guarantee that SP does not go too +; far into the guard at an ABI boundary. 
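+; (1024 is the limit returned by getStackProbeMaxUnprobedStack.)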
+define void @static_1024(i8** %out) "probe-stack"="inline-asm" "frame-pointer"="none" { +; CHECK-LABEL: static_1024: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #1024 // =1024 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: .cfi_def_cfa_offset 1040 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #1024 // =1024 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 1024, align 1 + store i8* %vla, i8** %out, align 8 + ret void +} + +; 4096 bytes is the largest frame we can probe in one go. +define void @static_4096(i8** %out) "probe-stack"="inline-asm" "frame-pointer"="none" { +; CHECK-LABEL: static_4096: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: .cfi_def_cfa_offset 4112 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 4096, align 1 + store i8* %vla, i8** %out, align 8 + ret void +} + +; Smallest stack frame which needs two probes (the first, smaller one gets +; folded into a store with writeback). +define void @static_4112(i8** %out) "probe-stack"="inline-asm" "frame-pointer"="none" { +; CHECK-LABEL: static_4112: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: str xzr, [sp, #-16]! +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: .cfi_def_cfa_offset 4128 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: add sp, sp, #16 // =16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 4112, align 1 + store i8* %vla, i8** %out, align 8 + ret void +} + +; Needs two probes, but neither can be folded into a store with writeback. +define void @static_6144(i8** %out) "probe-stack"="inline-asm" "frame-pointer"="none" { +; CHECK-LABEL: static_6144: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #2048 // =2048 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: .cfi_def_cfa_offset 6160 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: add sp, sp, #2048 // =2048 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 6144, align 1 + store i8* %vla, i8** %out, align 8 + ret void +} + +; Largest frame needing two probes +define void @static_8192(i8** %out) "probe-stack"="inline-asm" "frame-pointer"="none" { +; CHECK-LABEL: static_8192: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: .cfi_def_cfa_offset 8208 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #2, lsl #12 // =8192 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 8192, align 1 + store i8* %vla, i8** %out, align 8 + ret void +} + +; Largest frame probed without a loop +define void @static_12272(i8** %out) "probe-stack"="inline-asm" "frame-pointer"="none" { +; CHECK-LABEL: static_12272: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #4080 // =4080 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: .cfi_def_cfa_offset 12288 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #2, lsl #12 // =8192 +; CHECK-NEXT: add sp, sp, #4080 // =4080 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 12272, align 1 + store i8* %vla, i8** %out, align 8 + ret void +} + +; Smallest frame probed with a loop +define void @static_12288(i8** %out) "probe-stack"="inline-asm" "frame-pointer"="none" { +; CHECK-LABEL: static_12288: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: sub x9, sp, #3, lsl #12 // =12288 +; CHECK-NEXT: .LBB9_1: // %entry +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: cmp sp, x9 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: b.ne .LBB9_1 +; CHECK-NEXT: // %bb.2: // %entry +; CHECK-NEXT: .cfi_def_cfa_offset 12304 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #3, lsl #12 // =12288 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 12288, align 1 + store i8* %vla, i8** %out, align 8 + ret void +} + +; Large enough to use a loop, but not a multiple of 4KiB so needs an extra +; probe for the remainder. +define void @static_13312(i8** %out) "probe-stack"="inline-asm" "frame-pointer"="none" { +; CHECK-LABEL: static_13312: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #1024 // =1024 +; CHECK-NEXT: sub x9, sp, #3, lsl #12 // =12288 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: .LBB10_1: // %entry +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: cmp sp, x9 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: b.ne .LBB10_1 +; CHECK-NEXT: // %bb.2: // %entry +; CHECK-NEXT: .cfi_def_cfa_offset 13328 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #3, lsl #12 // =12288 +; CHECK-NEXT: add sp, sp, #1024 // =1024 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 13312, align 1 + store i8* %vla, i8** %out, align 8 + ret void +} + +; A small allocation, but with a very large alignment requirement. 
We do this +; by moving SP far enough that a sufficiently-aligned block will exist +; somewhere in the stack frame, so must probe the whole of that larger SP move. +define void @static_16_align_8192(i8** %out) "probe-stack"="inline-asm" "frame-pointer"="none" { +; CHECK-LABEL: static_16_align_8192: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-NEXT: sub x9, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: sub x9, x9, #4080 // =4080 +; CHECK-NEXT: and x9, x9, #0xffffffffffffe000 +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: .LBB11_1: // %entry +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: cmp sp, x9 +; CHECK-NEXT: b.le .LBB11_3 +; CHECK-NEXT: // %bb.2: // %entry +; CHECK-NEXT: // in Loop: Header=BB11_1 Depth=1 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: b .LBB11_1 +; CHECK-NEXT: .LBB11_3: // %entry +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: .cfi_def_cfa w29, 16 +; CHECK-NEXT: .cfi_offset w30, -8 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 16, align 8192 + store i8* %vla, i8** %out, align 8 + ret void +} + + +