Index: clang/docs/ReleaseNotes.rst
===================================================================
--- clang/docs/ReleaseNotes.rst
+++ clang/docs/ReleaseNotes.rst
@@ -75,8 +75,8 @@
 
 
 - -fstack-clash-protection will provide a protection against the stack clash
-  attack for x86 architecture through automatic probing of each page of
-  allocated stack.
+  attack for x86 and s390x architectures through automatic probing of each page
+  of allocated stack.
 
 - -ffp-exception-behavior={ignore,maytrap,strict} allows the user to specify
   the floating-point exception behavior.  The default setting is ``ignore``.
Index: clang/lib/Basic/Targets/SystemZ.h
===================================================================
--- clang/lib/Basic/Targets/SystemZ.h
+++ clang/lib/Basic/Targets/SystemZ.h
@@ -64,6 +64,10 @@
 
   ArrayRef<TargetInfo::AddlRegName> getGCCAddlRegNames() const override;
 
+  bool isSPRegName(StringRef RegName) const override {
+    return RegName.equals("r15");
+  }
+
   bool validateAsmConstraint(const char *&Name,
                              TargetInfo::ConstraintInfo &info) const override;
 
Index: clang/lib/Driver/ToolChains/Clang.cpp
===================================================================
--- clang/lib/Driver/ToolChains/Clang.cpp
+++ clang/lib/Driver/ToolChains/Clang.cpp
@@ -3036,7 +3036,7 @@
   if (!EffectiveTriple.isOSLinux())
     return;
 
-  if (!EffectiveTriple.isX86())
+  if (!EffectiveTriple.isX86() && !EffectiveTriple.isSystemZ())
     return;
 
   if (Args.hasFlag(options::OPT_fstack_clash_protection,
Index: clang/test/CodeGen/stack-clash-protection.c
===================================================================
--- clang/test/CodeGen/stack-clash-protection.c
+++ clang/test/CodeGen/stack-clash-protection.c
@@ -1,5 +1,6 @@
 // Check the correct function attributes are generated
 // RUN: %clang_cc1 -triple x86_64-linux -O0 -S -emit-llvm -o- %s -fstack-clash-protection | FileCheck %s
+// RUN: %clang_cc1 -triple s390x-linux-gnu -O0 -S -emit-llvm -o- %s -fstack-clash-protection | FileCheck %s
 
 // CHECK: define void @large_stack() #[[A:.*]] {
 void large_stack() {
Index: clang/test/Driver/stack-clash-protection-02.c
===================================================================
--- /dev/null
+++ clang/test/Driver/stack-clash-protection-02.c
@@ -0,0 +1,13 @@
+// RUN: %clang -target s390x-linux-gnu -fstack-clash-protection -### %s 2>&1 | FileCheck %s -check-prefix=SystemZ
+// SystemZ: "-fstack-clash-protection"
+// RUN: %clang -target s390x-linux-gnu -fstack-clash-protection -S -emit-llvm -o %t.ll %s 2>&1 | FileCheck %s -check-prefix=SystemZ-warn
+// SystemZ-warn: warning: Unable to protect inline asm that clobbers stack pointer against stack clash
+
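+// On SystemZ, %r15 is the stack pointer; inline asm that clobbers it cannot
+// be protected by stack probing, so clang emits the warning checked above.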
+int foo(int c) {
+  int r;
+  __asm__("ag %%r15, %0"
+          :
+          : "rm"(c)
+          : "r15");
+  return r;
+}
Index: llvm/include/llvm/ADT/Triple.h
===================================================================
--- llvm/include/llvm/ADT/Triple.h
+++ llvm/include/llvm/ADT/Triple.h
@@ -736,6 +736,11 @@
     return getArch() == Triple::riscv32 || getArch() == Triple::riscv64;
   }
 
+  /// Tests whether the target is SystemZ.
+  bool isSystemZ() const {
+    return getArch() == Triple::systemz;
+  }
+
   /// Tests whether the target is x86 (32- or 64-bit).
   bool isX86() const {
     return getArch() == Triple::x86 || getArch() == Triple::x86_64;
Index: llvm/lib/Target/SystemZ/SystemZFrameLowering.h
===================================================================
--- llvm/lib/Target/SystemZ/SystemZFrameLowering.h
+++ llvm/lib/Target/SystemZ/SystemZFrameLowering.h
@@ -43,6 +43,8 @@
                                            RegScavenger *RS) const override;
   void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
   void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+  void inlineStackProbe(MachineFunction &MF,
+                        MachineBasicBlock &PrologMBB) const override;
   bool hasFP(const MachineFunction &MF) const override;
   bool hasReservedCallFrame(const MachineFunction &MF) const override;
   int getFrameIndexReference(const MachineFunction &MF, int FI,
Index: llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp
===================================================================
--- llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp
+++ llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp
@@ -373,6 +373,49 @@
   }
 }
 
+// Create a new basic block after MBB.
+static MachineBasicBlock *emitBlockAfter(MachineBasicBlock *MBB) {
+  MachineFunction &MF = *MBB->getParent();
+  MachineBasicBlock *NewMBB = MF.CreateMachineBasicBlock(MBB->getBasicBlock());
+  MF.insert(std::next(MachineFunction::iterator(MBB)), NewMBB);
+  return NewMBB;
+}
+
+// Split MBB before MI and return the new block (the one that contains MI).
+static MachineBasicBlock *splitBlockBefore(MachineBasicBlock::iterator MI,
+                                           MachineBasicBlock *MBB) {
+  MachineBasicBlock *NewMBB = emitBlockAfter(MBB);
+  NewMBB->splice(NewMBB->begin(), MBB, MI, MBB->end());
+  NewMBB->transferSuccessorsAndUpdatePHIs(MBB);
+  return NewMBB;
+}
+
+// Add CFI for the new CFA offset.
+static void buildCFAOffs(MachineBasicBlock &MBB,
+                         MachineBasicBlock::iterator MBBI,
+                         const DebugLoc &DL, int Offset,
+                         const SystemZInstrInfo *ZII) {
+  unsigned CFIIndex = MBB.getParent()->addFrameInst(
+    MCCFIInstruction::createDefCfaOffset(nullptr, Offset));
+  BuildMI(MBB, MBBI, DL, ZII->get(TargetOpcode::CFI_INSTRUCTION))
+    .addCFIIndex(CFIIndex);
+}
+
+// Add CFI for the new frame location.
+static void buildDefCFAReg(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MBBI,
+                           const DebugLoc &DL, unsigned Reg,
+                           const SystemZInstrInfo *ZII) {
+  MachineFunction &MF = *MBB.getParent();
+  MachineModuleInfo &MMI = MF.getMMI();
+  const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo();
+  unsigned RegNum = MRI->getDwarfRegNum(Reg, true);
+  unsigned CFIIndex = MF.addFrameInst(
+                        MCCFIInstruction::createDefCfaRegister(nullptr, RegNum));
+  BuildMI(MBB, MBBI, DL, ZII->get(TargetOpcode::CFI_INSTRUCTION))
+    .addCFIIndex(CFIIndex);
+}
+
 void SystemZFrameLowering::emitPrologue(MachineFunction &MF,
                                         MachineBasicBlock &MBB) const {
   assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");
@@ -461,13 +504,23 @@
 
     // Allocate StackSize bytes.
     int64_t Delta = -int64_t(StackSize);
-    emitIncrement(MBB, MBBI, DL, SystemZ::R15D, Delta, ZII);
-
-    // Add CFI for the allocation.
-    unsigned CFIIndex = MF.addFrameInst(
-        MCCFIInstruction::createDefCfaOffset(nullptr, SPOffsetFromCFA + Delta));
-    BuildMI(MBB, MBBI, DL, ZII->get(TargetOpcode::CFI_INSTRUCTION))
-        .addCFIIndex(CFIIndex);
+    if (MF.getSubtarget().getTargetLowering()->hasInlineStackProbe(MF)) {
+      // Stack probing may involve looping, but control flow generation is
+      // disallowed at this point. Defer the allocation to later processing
+      // through `inlineStackProbe`.
+      MachineInstr *Stub = BuildMI(MBB, MBBI, DL, ZII->get(SystemZ::CallBRASL))
+        .addExternalSymbol("__chkstk_stub");
+
+      // Encode the static stack size as metadata attached to the stub.
+      LLVMContext &Context = MF.getFunction().getContext();
+      MachineInstrBuilder(MF, Stub).addMetadata(
+          MDTuple::get(Context, {ConstantAsMetadata::get(ConstantInt::get(
+                                  IntegerType::get(Context, 64), StackSize))}));
+    } else {
+      emitIncrement(MBB, MBBI, DL, SystemZ::R15D, Delta, ZII);
+      buildCFAOffs(MBB, MBBI, DL, SPOffsetFromCFA + Delta, ZII);
+    }
     SPOffsetFromCFA += Delta;
 
     if (StoreBackchain) {
@@ -485,11 +538,7 @@
       .addReg(SystemZ::R15D);
 
     // Add CFI for the new frame location.
-    unsigned HardFP = MRI->getDwarfRegNum(SystemZ::R11D, true);
-    unsigned CFIIndex = MF.addFrameInst(
-        MCCFIInstruction::createDefCfaRegister(nullptr, HardFP));
-    BuildMI(MBB, MBBI, DL, ZII->get(TargetOpcode::CFI_INSTRUCTION))
-        .addCFIIndex(CFIIndex);
+    buildDefCFAReg(MBB, MBBI, DL, SystemZ::R11D, ZII);
 
     // Mark the FramePtr as live at the beginning of every block except
     // the entry block.  (We'll have marked R11 as live on entry when
@@ -582,6 +631,100 @@
   }
 }
 
+void SystemZFrameLowering::inlineStackProbe(MachineFunction &MF,
+                                            MachineBasicBlock &PrologMBB) const {
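+  // Replace the __chkstk_stub call from emitPrologue() with inline code that
+  // probes the allocated stack, either with a few unrolled probes or with a
+  // loop probing one stack-probe-size block at a time, plus the residual.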
+  auto *ZII =
+    static_cast<const SystemZInstrInfo *>(MF.getSubtarget().getInstrInfo());
+  const SystemZSubtarget &STI = MF.getSubtarget<SystemZSubtarget>();
+  const SystemZTargetLowering &TLI = *STI.getTargetLowering();
+
+  const StringRef ChkStkStubSymbol = "__chkstk_stub";
+  MachineInstr *ChkStkStubMI = nullptr;
+  for (MachineInstr &MI : PrologMBB)
+    if (MI.isCall() && MI.getOperand(0).isSymbol() &&
+        ChkStkStubSymbol == MI.getOperand(0).getSymbolName()) {
+      ChkStkStubMI = &MI;
+      break;
+    }
+  if (ChkStkStubMI == nullptr)
+    return;
+  assert(ChkStkStubMI->getOperand(1).isMetadata() &&
+         "no metadata attached to that probe");
+  uint64_t StackSize =
+      cast<ConstantInt>(
+          cast<ConstantAsMetadata>(
+              cast<MDTuple>(ChkStkStubMI->getOperand(1).getMetadata())
+                  ->getOperand(0))
+              ->getValue())
+          ->getZExtValue();
+
+  unsigned StackProbeSize = TLI.getStackProbeSize(MF);
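+  // Split the static allocation into full StackProbeSize blocks plus a
+  // residual part smaller than StackProbeSize.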
+  uint64_t NumFullBlocks = StackSize / StackProbeSize;
+  uint64_t Residual = StackSize % StackProbeSize;
+  int64_t SPOffsetFromCFA = -SystemZMC::CFAOffsetFromInitialSP;
+  MachineBasicBlock *MBB = &PrologMBB;
+  MachineBasicBlock::iterator MBBI = ChkStkStubMI;
+  const DebugLoc DL = ChkStkStubMI->getDebugLoc();
+
+  // Allocate a block of Size bytes on the stack and probe it.
+  auto allocateAndProbe = [&](MachineBasicBlock &InsMBB,
+                              MachineBasicBlock::iterator InsPt, unsigned Size,
+                              bool EmitCFI) -> void {
+    emitIncrement(InsMBB, InsPt, DL, SystemZ::R15D, -int64_t(Size), ZII);
+    if (EmitCFI) {
+      SPOffsetFromCFA -= Size;
+      buildCFAOffs(InsMBB, InsPt, DL, SPOffsetFromCFA, ZII);
+    }
+    // Probe by means of a volatile compare.
+    MachineMemOperand *MMO = MF.getMachineMemOperand(MachinePointerInfo(),
+      MachineMemOperand::MOVolatile | MachineMemOperand::MOLoad, 8, Align(1));
+    BuildMI(InsMBB, InsPt, DL, ZII->get(SystemZ::CG)).addReg(SystemZ::R0D)
+      .addReg(SystemZ::R15D).addImm(Size - 8).addReg(0)
+      .addMemOperand(MMO);
+  };
+
+  if (NumFullBlocks < 3) {
+    // Emit unrolled probe statements.
+    for (unsigned int i = 0; i < NumFullBlocks; i++)
+      allocateAndProbe(*MBB, MBBI, StackProbeSize, true/*EmitCFI*/);
+  } else {
+    // Emit a loop probing the pages.
+    uint64_t LoopAlloc = StackProbeSize * NumFullBlocks;
+    SPOffsetFromCFA -= LoopAlloc;
+
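+    // %r1 holds the loop bound (the final stack pointer value) and serves as
+    // the temporary CFA register while %r15 is being decremented.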
+    BuildMI(*MBB, MBBI, DL, ZII->get(SystemZ::LGR), SystemZ::R1D)
+      .addReg(SystemZ::R15D);
+    buildDefCFAReg(*MBB, MBBI, DL, SystemZ::R1D, ZII);
+    emitIncrement(*MBB, MBBI, DL, SystemZ::R1D, -int64_t(LoopAlloc), ZII);
+    buildCFAOffs(*MBB, MBBI, DL, SystemZMC::CallFrameSize + LoopAlloc, ZII);
+
+    MachineBasicBlock *DoneMBB = splitBlockBefore(MBBI, MBB);
+    MachineBasicBlock *LoopMBB = emitBlockAfter(MBB);
+    MBB->addSuccessor(LoopMBB);
+    LoopMBB->addSuccessor(LoopMBB);
+    LoopMBB->addSuccessor(DoneMBB);
+
+    MBB = LoopMBB;
+    allocateAndProbe(*MBB, MBB->end(), StackProbeSize, false/*EmitCFI*/);
+    BuildMI(*MBB, MBB->end(), DL, ZII->get(SystemZ::CLGR))
+      .addReg(SystemZ::R15D).addReg(SystemZ::R1D);
+    BuildMI(*MBB, MBB->end(), DL, ZII->get(SystemZ::BRC))
+      .addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_GT).addMBB(MBB);
+
+    MBB = DoneMBB;
+    MBBI = DoneMBB->begin();
+    buildDefCFAReg(*MBB, MBBI, DL, SystemZ::R15D, ZII);
+    if (!Residual)
+      buildCFAOffs(*MBB, MBBI, DL, SPOffsetFromCFA, ZII);
+  }
+
+  // Allocate and probe the residual part (less than StackProbeSize).
+  if (Residual)
+    allocateAndProbe(*MBB, MBBI, Residual, true/*EmitCFI*/);
+
+  ChkStkStubMI->eraseFromParent();
+}
+
 bool SystemZFrameLowering::hasFP(const MachineFunction &MF) const {
   return (MF.getTarget().Options.DisableFramePointerElim(MF) ||
           MF.getFrameInfo().hasVarSizedObjects() ||
Index: llvm/lib/Target/SystemZ/SystemZISelLowering.h
===================================================================
--- llvm/lib/Target/SystemZ/SystemZISelLowering.h
+++ llvm/lib/Target/SystemZ/SystemZISelLowering.h
@@ -83,6 +83,10 @@
   // base of the dynamically-allocatable area.
   ADJDYNALLOC,
 
+  // For allocating stack space when the stack clash protector is in use.
+  // Allocation is performed in blocks, and each block is probed.
+  PROBED_ALLOCA,
+
   // Count number of bits set in operand 0 per byte.
   POPCNT,
 
@@ -428,6 +432,7 @@
                                   EVT VT) const override;
   bool isFPImmLegal(const APFloat &Imm, EVT VT,
                     bool ForCodeSize) const override;
+  bool hasInlineStackProbe(MachineFunction &MF) const override;
   bool isLegalICmpImmediate(int64_t Imm) const override;
   bool isLegalAddImmediate(int64_t Imm) const override;
   bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty,
@@ -556,6 +561,8 @@
     return true;
   }
 
+  unsigned getStackProbeSize(MachineFunction &MF) const;
+
 private:
   const SystemZSubtarget &Subtarget;
 
@@ -690,6 +697,8 @@
   MachineBasicBlock *emitLoadAndTestCmp0(MachineInstr &MI,
                                          MachineBasicBlock *MBB,
                                          unsigned Opcode) const;
+  MachineBasicBlock *emitProbedAlloca(MachineInstr &MI,
+                                      MachineBasicBlock *MBB) const;
 
   MachineMemOperand::Flags
   getTargetMMOFlags(const Instruction &I) const override;
Index: llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
===================================================================
--- llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -824,6 +824,15 @@
   return SystemZVectorConstantInfo(Imm).isVectorConstantLegal(Subtarget);
 }
 
+/// Returns true if stack probing through inline assembly is requested.
+bool SystemZTargetLowering::hasInlineStackProbe(MachineFunction &MF) const {
+  // If the function specifically requests inline stack probes, emit them.
+  if (MF.getFunction().hasFnAttribute("probe-stack"))
+    return MF.getFunction().getFnAttribute("probe-stack").getValueAsString() ==
+           "inline-asm";
+  return false;
+}
+
 bool SystemZTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
   // We can use CGFI or CLGFI.
   return isInt<32>(Imm) || isUInt<32>(Imm);
@@ -3426,10 +3435,17 @@
                               DAG.getConstant(ExtraAlignSpace, DL, MVT::i64));
 
   // Get the new stack pointer value.
-  SDValue NewSP = DAG.getNode(ISD::SUB, DL, MVT::i64, OldSP, NeededSpace);
-
-  // Copy the new stack pointer back.
-  Chain = DAG.getCopyToReg(Chain, DL, SPReg, NewSP);
+  SDValue NewSP;
+  if (hasInlineStackProbe(MF)) {
+    NewSP = DAG.getNode(SystemZISD::PROBED_ALLOCA, DL,
+                DAG.getVTList(MVT::i64, MVT::Other), Chain, OldSP, NeededSpace);
+    Chain = NewSP.getValue(1);
+  } else {
+    NewSP = DAG.getNode(ISD::SUB, DL, MVT::i64, OldSP, NeededSpace);
+    // Copy the new stack pointer back.
+    Chain = DAG.getCopyToReg(Chain, DL, SPReg, NewSP);
+  }
 
   // The allocated data lives above the 160 bytes allocated for the standard
   // frame, plus any outgoing stack arguments.  We don't know how much that
@@ -5343,6 +5359,7 @@
     OPCODE(BR_CCMASK);
     OPCODE(SELECT_CCMASK);
     OPCODE(ADJDYNALLOC);
+    OPCODE(PROBED_ALLOCA);
     OPCODE(POPCNT);
     OPCODE(SMUL_LOHI);
     OPCODE(UMUL_LOHI);
@@ -6738,6 +6755,19 @@
   return 1;
 }
 
+unsigned
+SystemZTargetLowering::getStackProbeSize(MachineFunction &MF) const {
+  // The default stack probe size is 4096 if the function has no
+  // "stack-probe-size" attribute.
+  unsigned StackProbeSize = 4096;
+  const Function &Fn = MF.getFunction();
+  if (Fn.hasFnAttribute("stack-probe-size"))
+    Fn.getFnAttribute("stack-probe-size")
+        .getValueAsString()
+        .getAsInteger(0, StackProbeSize);
+  return StackProbeSize;
+}
+
 //===----------------------------------------------------------------------===//
 // Custom insertion
 //===----------------------------------------------------------------------===//
@@ -7803,6 +7833,73 @@
   return MBB;
 }
 
+MachineBasicBlock *SystemZTargetLowering::emitProbedAlloca(
+    MachineInstr &MI, MachineBasicBlock *MBB) const {
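+  // Allocate the dynamic stack space one ProbeSize block at a time, probing
+  // each block with a volatile compare; the final partial block is allocated
+  // without a probe.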
+  MachineFunction &MF = *MBB->getParent();
+  MachineRegisterInfo *MRI = &MF.getRegInfo();
+  const SystemZInstrInfo *TII =
+      static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
+  DebugLoc DL = MI.getDebugLoc();
+  const unsigned ProbeSize = getStackProbeSize(MF);
+  unsigned SizeReg = MI.getOperand(2).getReg();
+
+  MachineBasicBlock *StartMBB = MBB;
+  MachineBasicBlock *TailMBB  = splitBlockAfter(MI, MBB);
+  MachineBasicBlock *TestMBB  = emitBlockAfter(StartMBB);
+  MachineBasicBlock *BlockMBB = emitBlockAfter(TestMBB);
+
+  unsigned TmpSizeReg = MRI->createVirtualRegister(&SystemZ::GR64BitRegClass);
+  unsigned TmpSizeReg2 = MRI->createVirtualRegister(&SystemZ::GR64BitRegClass);
+
+  // TestMBB
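+  // Fall through to BlockMBB while at least ProbeSize bytes remain to be
+  // allocated; otherwise branch to TailMBB.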
+  BuildMI(TestMBB, DL, TII->get(SystemZ::PHI), TmpSizeReg)
+    .addReg(SizeReg)
+    .addMBB(MBB)
+    .addReg(TmpSizeReg2)
+    .addMBB(BlockMBB);
+
+  BuildMI(TestMBB, DL, TII->get(SystemZ::CLGFI))
+    .addReg(TmpSizeReg)
+    .addImm(ProbeSize);
+
+  BuildMI(TestMBB, DL, TII->get(SystemZ::BRC))
+    .addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_LT).addMBB(TailMBB);
+  StartMBB->addSuccessor(TestMBB);
+  TestMBB->addSuccessor(BlockMBB);
+  TestMBB->addSuccessor(TailMBB);
+
+  // BlockMBB: Allocate and probe by means of a volatile compare.
+  BuildMI(BlockMBB, DL, TII->get(SystemZ::SLGFI), TmpSizeReg2)
+    .addReg(TmpSizeReg)
+    .addImm(ProbeSize);
+
+  BuildMI(BlockMBB, DL, TII->get(SystemZ::SLGFI), SystemZ::R15D)
+    .addReg(SystemZ::R15D)
+    .addImm(ProbeSize);
+
+  MachineMemOperand *MMO = MF.getMachineMemOperand(MachinePointerInfo(),
+    MachineMemOperand::MOVolatile | MachineMemOperand::MOLoad, 8, Align(1));
+  BuildMI(BlockMBB, DL, TII->get(SystemZ::CG)).addReg(SystemZ::R15D)
+    .addReg(SystemZ::R15D).addImm(ProbeSize - 8).addReg(0)
+    .addMemOperand(MMO);
+
+  BuildMI(BlockMBB, DL, TII->get(SystemZ::J)).addMBB(TestMBB);
+  BlockMBB->addSuccessor(TestMBB);
+
+  // TailMBB
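+  // Allocate the remaining bytes (less than ProbeSize) without a probe and
+  // copy the resulting stack pointer to the result register.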
+  MachineBasicBlock::iterator InsPos = TailMBB->begin();
+  BuildMI(*TailMBB, InsPos, DL, TII->get(SystemZ::SLGR), SystemZ::R15D)
+    .addReg(SystemZ::R15D)
+    .addReg(TmpSizeReg);
+
+  BuildMI(*TailMBB, InsPos, DL, TII->get(TargetOpcode::COPY),
+          MI.getOperand(0).getReg())
+    .addReg(SystemZ::R15D);
+
+  MI.eraseFromParent();
+  return TailMBB;
+}
+
 MachineBasicBlock *SystemZTargetLowering::EmitInstrWithCustomInserter(
     MachineInstr &MI, MachineBasicBlock *MBB) const {
   switch (MI.getOpcode()) {
@@ -8063,6 +8160,9 @@
   case SystemZ::LTXBRCompare_VecPseudo:
     return emitLoadAndTestCmp0(MI, MBB, SystemZ::LTXBR);
 
+  case SystemZ::PROBED_ALLOCA:
+    return emitProbedAlloca(MI, MBB);
+
   case TargetOpcode::STACKMAP:
   case TargetOpcode::PATCHPOINT:
     return emitPatchPoint(MI, MBB);
Index: llvm/lib/Target/SystemZ/SystemZInstrInfo.td
===================================================================
--- llvm/lib/Target/SystemZ/SystemZInstrInfo.td
+++ llvm/lib/Target/SystemZ/SystemZInstrInfo.td
@@ -29,6 +29,11 @@
 def ADJDYNALLOC : Pseudo<(outs GR64:$dst), (ins dynalloc12only:$src),
                          [(set GR64:$dst, dynalloc12only:$src)]>;
 
+let Defs = [R15D, CC], Uses = [R15D], hasNoSchedulingInfo = 1,
+    usesCustomInserter = 1 in
+  def PROBED_ALLOCA : Pseudo<(outs GR64:$dst),
+                             (ins GR64:$oldSP, GR64:$space),
+           [(set GR64:$dst, (z_probed_alloca GR64:$oldSP, GR64:$space))]>;
 
 //===----------------------------------------------------------------------===//
 // Branch instructions
Index: llvm/lib/Target/SystemZ/SystemZOperators.td
===================================================================
--- llvm/lib/Target/SystemZ/SystemZOperators.td
+++ llvm/lib/Target/SystemZ/SystemZOperators.td
@@ -40,6 +40,10 @@
                                              SDTCisSameAs<0, 2>,
                                              SDTCisPtrTy<0>]>;
 def SDT_ZAdjDynAlloc        : SDTypeProfile<1, 0, [SDTCisVT<0, i64>]>;
+def SDT_ZProbedAlloca       : SDTypeProfile<1, 2,
+                                            [SDTCisSameAs<0, 1>,
+                                             SDTCisSameAs<0, 2>,
+                                             SDTCisPtrTy<0>]>;
 def SDT_ZGR128Binary        : SDTypeProfile<1, 2,
                                             [SDTCisVT<0, untyped>,
                                              SDTCisInt<1>,
@@ -269,6 +273,8 @@
                                  SDT_ZSelectCCMask>;
 def z_ipm_1             : SDNode<"SystemZISD::IPM", SDT_ZIPM>;
 def z_adjdynalloc       : SDNode<"SystemZISD::ADJDYNALLOC", SDT_ZAdjDynAlloc>;
+def z_probed_alloca     : SDNode<"SystemZISD::PROBED_ALLOCA", SDT_ZProbedAlloca,
+                                 [SDNPHasChain]>;
 def z_popcnt            : SDNode<"SystemZISD::POPCNT", SDTIntUnaryOp>;
 def z_smul_lohi         : SDNode<"SystemZISD::SMUL_LOHI", SDT_ZGR128Binary>;
 def z_umul_lohi         : SDNode<"SystemZISD::UMUL_LOHI", SDT_ZGR128Binary>;
Index: llvm/test/CodeGen/SystemZ/stack-clash-dynamic-alloca.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/SystemZ/stack-clash-dynamic-alloca.ll
@@ -0,0 +1,45 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s
+
+define i32 @foo(i32 %n) #0 {
+; CHECK-LABEL: foo:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    stmg %r11, %r15, 88(%r15)
+; CHECK-NEXT:    .cfi_offset %r11, -72
+; CHECK-NEXT:    .cfi_offset %r15, -40
+; CHECK-NEXT:    aghi %r15, -160
+; CHECK-NEXT:    .cfi_def_cfa_offset 320
+; CHECK-NEXT:    cg %r0, 152(%r15)
+; CHECK-NEXT:    lgr %r11, %r15
+; CHECK-NEXT:    .cfi_def_cfa_register %r11
+; CHECK-NEXT:    # kill: def $r2l killed $r2l def $r2d
+; CHECK-NEXT:    risbgn %r1, %r2, 30, 189, 2
+; CHECK-NEXT:    la %r0, 7(%r1)
+; CHECK-NEXT:    risbgn %r1, %r0, 29, 188, 0
+; CHECK-NEXT:    la %r1, 8(%r1)
+; CHECK-NEXT:    clgfi %r1, 4096
+; CHECK-NEXT:    jl .LBB0_2
+; CHECK-NEXT:  .LBB0_1: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    slgfi %r1, 4096
+; CHECK-NEXT:    slgfi %r15, 4096
+; CHECK-NEXT:    cg %r15, 4088(%r15)
+; CHECK-NEXT:    clgfi %r1, 4096
+; CHECK-NEXT:    jhe .LBB0_1
+; CHECK-NEXT:  .LBB0_2:
+; CHECK-NEXT:    slgr %r15, %r1
+; CHECK-NEXT:    la %r1, 168(%r15)
+; CHECK-NEXT:    nill %r1, 65520
+; CHECK-NEXT:    lhi %r0, 1
+; CHECK-NEXT:    sty %r0, 4792(%r1)
+; CHECK-NEXT:    l %r2, 0(%r1)
+; CHECK-NEXT:    lmg %r11, %r15, 248(%r11)
+; CHECK-NEXT:    br %r14
+
+  %a = alloca i32, i32 %n, align 16
+  %b = getelementptr inbounds i32, i32* %a, i64 1198
+  store volatile i32 1, i32* %b
+  %c = load volatile i32, i32* %a
+  ret i32 %c
+}
+
+attributes #0 =  {"probe-stack"="inline-asm"}
Index: llvm/test/CodeGen/SystemZ/stack-clash-large.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/SystemZ/stack-clash-large.ll
@@ -0,0 +1,45 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s
+
+define i32 @foo() #0 {
+; CHECK-LABEL: foo:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    stmg %r11, %r15, 88(%r15)
+; CHECK-NEXT:    .cfi_offset %r11, -72
+; CHECK-NEXT:    .cfi_offset %r15, -40
+; CHECK-NEXT:    aghi %r15, -160
+; CHECK-NEXT:    .cfi_def_cfa_offset 320
+; CHECK-NEXT:    cg %r0, 152(%r15)
+; CHECK-NEXT:    lgr %r11, %r15
+; CHECK-NEXT:    .cfi_def_cfa_register %r11
+; CHECK-NEXT:    lgfi %r1, 72008
+; CHECK-NEXT:    clgfi %r1, 4096
+; CHECK-NEXT:    jl .LBB0_2
+; CHECK-NEXT:  .LBB0_1: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    slgfi %r1, 4096
+; CHECK-NEXT:    slgfi %r15, 4096
+; CHECK-NEXT:    cg %r15, 4088(%r15)
+; CHECK-NEXT:    clgfi %r1, 4096
+; CHECK-NEXT:    jhe .LBB0_1
+; CHECK-NEXT:  .LBB0_2:
+; CHECK-NEXT:    slgr %r15, %r1
+; CHECK-NEXT:    la %r1, 168(%r15)
+; CHECK-NEXT:    nill %r1, 65520
+; CHECK-NEXT:    lhi %r0, 1
+; CHECK-NEXT:    mvhi 392(%r1), 1
+; CHECK-NEXT:    sty %r0, 28792(%r1)
+; CHECK-NEXT:    l %r2, 0(%r1)
+; CHECK-NEXT:    lmg %r11, %r15, 248(%r11)
+; CHECK-NEXT:    br %r14
+
+
+  %a = alloca i32, i64 18000, align 16
+  %b0 = getelementptr inbounds i32, i32* %a, i64 98
+  %b1 = getelementptr inbounds i32, i32* %a, i64 7198
+  store volatile i32 1, i32* %b0
+  store volatile i32 1, i32* %b1
+  %c = load volatile i32, i32* %a
+  ret i32 %c
+}
+
+attributes #0 =  {"probe-stack"="inline-asm"}
Index: llvm/test/CodeGen/SystemZ/stack-clash-medium-natural-probes-multiple-objects.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/SystemZ/stack-clash-medium-natural-probes-multiple-objects.ll
@@ -0,0 +1,57 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s
+
+define i32 @foo() #0 {
+; CHECK-LABEL: foo:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    stmg %r11, %r15, 88(%r15)
+; CHECK-NEXT:    .cfi_offset %r11, -72
+; CHECK-NEXT:    .cfi_offset %r15, -40
+; CHECK-NEXT:    aghi %r15, -160
+; CHECK-NEXT:    .cfi_def_cfa_offset 320
+; CHECK-NEXT:    cg %r0, 152(%r15)
+; CHECK-NEXT:    lgr %r11, %r15
+; CHECK-NEXT:    .cfi_def_cfa_register %r11
+; CHECK-NEXT:    lghi %r1, 4008
+; CHECK-NEXT:    clgfi %r1, 4096
+; CHECK-NEXT:    jl .LBB0_2
+; CHECK-NEXT:  .LBB0_1: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    slgfi %r1, 4096
+; CHECK-NEXT:    slgfi %r15, 4096
+; CHECK-NEXT:    cg %r15, 4088(%r15)
+; CHECK-NEXT:    clgfi %r1, 4096
+; CHECK-NEXT:    jhe .LBB0_1
+; CHECK-NEXT:  .LBB0_2:
+; CHECK-NEXT:    slgr %r15, %r1
+; CHECK-NEXT:    la %r1, 168(%r15)
+; CHECK-NEXT:    lghi %r2, 2008
+; CHECK-NEXT:    nill %r1, 65520
+; CHECK-NEXT:    clgfi %r2, 4096
+; CHECK-NEXT:    jl .LBB0_4
+; CHECK-NEXT:  .LBB0_3: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    slgfi %r2, 4096
+; CHECK-NEXT:    slgfi %r15, 4096
+; CHECK-NEXT:    cg %r15, 4088(%r15)
+; CHECK-NEXT:    clgfi %r2, 4096
+; CHECK-NEXT:    jhe .LBB0_3
+; CHECK-NEXT:  .LBB0_4:
+; CHECK-NEXT:    slgr %r15, %r2
+; CHECK-NEXT:    la %r2, 168(%r15)
+; CHECK-NEXT:    nill %r2, 65520
+; CHECK-NEXT:    mvhi 2000(%r1), 1
+; CHECK-NEXT:    mvhi 800(%r2), 2
+; CHECK-NEXT:    l %r2, 0(%r1)
+; CHECK-NEXT:    lmg %r11, %r15, 248(%r11)
+; CHECK-NEXT:    br %r14
+
+  %a = alloca i32, i64 1000, align 16
+  %b = alloca i32, i64 500, align 16
+  %a0 = getelementptr inbounds i32, i32* %a, i64 500
+  %b0 = getelementptr inbounds i32, i32* %b, i64 200
+  store volatile i32 1, i32* %a0
+  store volatile i32 2, i32* %b0
+  %c = load volatile i32, i32* %a
+  ret i32 %c
+}
+
+attributes #0 =  {"probe-stack"="inline-asm"}
Index: llvm/test/CodeGen/SystemZ/stack-clash-medium-natural-probes.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/SystemZ/stack-clash-medium-natural-probes.ll
@@ -0,0 +1,44 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s
+
+define i32 @foo() #0 {
+; CHECK-LABEL: foo:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    stmg %r11, %r15, 88(%r15)
+; CHECK-NEXT:    .cfi_offset %r11, -72
+; CHECK-NEXT:    .cfi_offset %r15, -40
+; CHECK-NEXT:    aghi %r15, -160
+; CHECK-NEXT:    .cfi_def_cfa_offset 320
+; CHECK-NEXT:    cg %r0, 152(%r15)
+; CHECK-NEXT:    lgr %r11, %r15
+; CHECK-NEXT:    .cfi_def_cfa_register %r11
+; CHECK-NEXT:    lghi %r1, 8008
+; CHECK-NEXT:    clgfi %r1, 4096
+; CHECK-NEXT:    jl .LBB0_2
+; CHECK-NEXT:  .LBB0_1: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    slgfi %r1, 4096
+; CHECK-NEXT:    slgfi %r15, 4096
+; CHECK-NEXT:    cg %r15, 4088(%r15)
+; CHECK-NEXT:    clgfi %r1, 4096
+; CHECK-NEXT:    jhe .LBB0_1
+; CHECK-NEXT:  .LBB0_2:
+; CHECK-NEXT:    slgr %r15, %r1
+; CHECK-NEXT:    la %r1, 168(%r15)
+; CHECK-NEXT:    nill %r1, 65520
+; CHECK-NEXT:    lhi %r0, 1
+; CHECK-NEXT:    mvhi 392(%r1), 1
+; CHECK-NEXT:    sty %r0, 4792(%r1)
+; CHECK-NEXT:    l %r2, 0(%r1)
+; CHECK-NEXT:    lmg %r11, %r15, 248(%r11)
+; CHECK-NEXT:    br %r14
+
+  %a = alloca i32, i64 2000, align 16
+  %b0 = getelementptr inbounds i32, i32* %a, i64 98
+  %b1 = getelementptr inbounds i32, i32* %a, i64 1198
+  store i32 1, i32* %b0
+  store i32 1, i32* %b1
+  %c = load volatile i32, i32* %a
+  ret i32 %c
+}
+
+attributes #0 =  {"probe-stack"="inline-asm"}
Index: llvm/test/CodeGen/SystemZ/stack-clash-medium.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/SystemZ/stack-clash-medium.ll
@@ -0,0 +1,40 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s
+
+define i32 @foo() #0 {
+; CHECK-LABEL: foo:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    stmg %r11, %r15, 88(%r15)
+; CHECK-NEXT:    .cfi_offset %r11, -72
+; CHECK-NEXT:    .cfi_offset %r15, -40
+; CHECK-NEXT:    aghi %r15, -160
+; CHECK-NEXT:    .cfi_def_cfa_offset 320
+; CHECK-NEXT:    cg %r0, 152(%r15)
+; CHECK-NEXT:    lgr %r11, %r15
+; CHECK-NEXT:    .cfi_def_cfa_register %r11
+; CHECK-NEXT:    lghi %r1, 8008
+; CHECK-NEXT:    clgfi %r1, 4096
+; CHECK-NEXT:    jl .LBB0_2
+; CHECK-NEXT:  .LBB0_1: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    slgfi %r1, 4096
+; CHECK-NEXT:    slgfi %r15, 4096
+; CHECK-NEXT:    cg %r15, 4088(%r15)
+; CHECK-NEXT:    clgfi %r1, 4096
+; CHECK-NEXT:    jhe .LBB0_1
+; CHECK-NEXT:  .LBB0_2:
+; CHECK-NEXT:    slgr %r15, %r1
+; CHECK-NEXT:    la %r1, 168(%r15)
+; CHECK-NEXT:    nill %r1, 65520
+; CHECK-NEXT:    mvhi 800(%r1), 1
+; CHECK-NEXT:    l %r2, 0(%r1)
+; CHECK-NEXT:    lmg %r11, %r15, 248(%r11)
+; CHECK-NEXT:    br %r14
+
+  %a = alloca i32, i64 2000, align 16
+  %b = getelementptr inbounds i32, i32* %a, i64 200
+  store volatile i32 1, i32* %b
+  %c = load volatile i32, i32* %a
+  ret i32 %c
+}
+
+attributes #0 =  {"probe-stack"="inline-asm"}
Index: llvm/test/CodeGen/SystemZ/stack-clash-no-free-probe.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/SystemZ/stack-clash-no-free-probe.ll
@@ -0,0 +1,43 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s
+
+define i32 @foo(i64 %i) #0 {
+; CHECK-LABEL: foo:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    stmg %r11, %r15, 88(%r15)
+; CHECK-NEXT:    .cfi_offset %r11, -72
+; CHECK-NEXT:    .cfi_offset %r15, -40
+; CHECK-NEXT:    aghi %r15, -160
+; CHECK-NEXT:    .cfi_def_cfa_offset 320
+; CHECK-NEXT:    cg %r0, 152(%r15)
+; CHECK-NEXT:    lgr %r11, %r15
+; CHECK-NEXT:    .cfi_def_cfa_register %r11
+; CHECK-NEXT:    lghi %r1, 8008
+; CHECK-NEXT:    clgfi %r1, 4096
+; CHECK-NEXT:    jl .LBB0_2
+; CHECK-NEXT:  .LBB0_1: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    slgfi %r1, 4096
+; CHECK-NEXT:    slgfi %r15, 4096
+; CHECK-NEXT:    cg %r15, 4088(%r15)
+; CHECK-NEXT:    clgfi %r1, 4096
+; CHECK-NEXT:    jhe .LBB0_1
+; CHECK-NEXT:  .LBB0_2:
+; CHECK-NEXT:    slgr %r15, %r1
+; CHECK-NEXT:    la %r1, 168(%r15)
+; CHECK-NEXT:    nill %r1, 65520
+; CHECK-NEXT:    sllg %r2, %r2, 2
+; CHECK-NEXT:    lhi %r0, 1
+; CHECK-NEXT:    st %r0, 0(%r2,%r1)
+; CHECK-NEXT:    l %r2, 0(%r1)
+; CHECK-NEXT:    lmg %r11, %r15, 248(%r11)
+; CHECK-NEXT:    br %r14
+
+  %a = alloca i32, i32 2000, align 16
+  %b = getelementptr inbounds i32, i32* %a, i64 %i
+  store volatile i32 1, i32* %b
+  %c = load volatile i32, i32* %a
+  ret i32 %c
+}
+
+attributes #0 =  {"probe-stack"="inline-asm"}
+
Index: llvm/test/CodeGen/SystemZ/stack-clash-protection.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/SystemZ/stack-clash-protection.ll
@@ -0,0 +1,123 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 -O3 | FileCheck %s
+
+; Small enough to avoid loop.
+define void @fun0() #0 {
+; CHECK-LABEL: fun0:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    aghi %r15, -4096
+; CHECK-NEXT:    .cfi_def_cfa_offset 4256
+; CHECK-NEXT:    cg %r0, 4088(%r15)
+; CHECK-NEXT:    aghi %r15, -88
+; CHECK-NEXT:    .cfi_def_cfa_offset 4344
+; CHECK-NEXT:    cg %r0, 80(%r15)
+; CHECK-NEXT:    mvhi 180(%r15), 0
+; CHECK-NEXT:    l %r0, 180(%r15)
+; CHECK-NEXT:    aghi %r15, 4184
+; CHECK-NEXT:    br %r14
+entry:
+  %stack = alloca [1000 x i32], align 4
+  %i = alloca i32, align 4
+  %0 = bitcast [1000 x i32]* %stack to i8*
+  %i.0.i.0..sroa_cast = bitcast i32* %i to i8*
+  store volatile i32 0, i32* %i, align 4
+  %i.0.i.0.6 = load volatile i32, i32* %i, align 4
+  ret void
+}
+
+; Uses a loop to allocate and probe in steps.
+define void @fun1() #0 {
+; CHECK-LABEL: fun1:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    lgr %r1, %r15
+; CHECK-NEXT:    .cfi_def_cfa_register %r1
+; CHECK-NEXT:    aghi %r1, -28672
+; CHECK-NEXT:    .cfi_def_cfa_offset -28832
+; CHECK-NEXT:  .LBB1_1: # %entry
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    aghi %r15, -4096
+; CHECK-NEXT:    cg %r0, 4088(%r15)
+; CHECK-NEXT:    clgrjh %r15, %r1, .LBB1_1
+; CHECK-NEXT:  # %bb.2: # %entry
+; CHECK-NEXT:    .cfi_def_cfa_register %r15
+; CHECK-NEXT:    aghi %r15, -3512
+; CHECK-NEXT:    .cfi_def_cfa_offset 32344
+; CHECK-NEXT:    cg %r0, 3504(%r15)
+; CHECK-NEXT:    mvhi 180(%r15), 0
+; CHECK-NEXT:    l %r0, 180(%r15)
+; CHECK-NEXT:    aghi %r15, 32184
+; CHECK-NEXT:    br %r14
+entry:
+  %stack = alloca [8000 x i32], align 4
+  %i = alloca i32, align 4
+  %0 = bitcast [8000 x i32]* %stack to i8*
+  %i.0.i.0..sroa_cast = bitcast i32* %i to i8*
+  store volatile i32 0, i32* %i, align 4
+  %i.0.i.0.6 = load volatile i32, i32* %i, align 4
+  ret void
+}
+
+; Loop with bigger step.
+define void @fun2() #0 "stack-probe-size"="8192" {
+; CHECK-LABEL: fun2:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    lgr %r1, %r15
+; CHECK-NEXT:    .cfi_def_cfa_register %r1
+; CHECK-NEXT:    aghi %r1, -24576
+; CHECK-NEXT:    .cfi_def_cfa_offset -24736
+; CHECK-NEXT:  .LBB2_1: # %entry
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    aghi %r15, -8192
+; CHECK-NEXT:    cg %r0, 8184(%r15)
+; CHECK-NEXT:    clgrjh %r15, %r1, .LBB2_1
+; CHECK-NEXT:  # %bb.2: # %entry
+; CHECK-NEXT:    .cfi_def_cfa_register %r15
+; CHECK-NEXT:    aghi %r15, -7608
+; CHECK-NEXT:    .cfi_def_cfa_offset 32344
+; CHECK-NEXT:    cg %r0, 7600(%r15)
+; CHECK-NEXT:    mvhi 180(%r15), 0
+; CHECK-NEXT:    l %r0, 180(%r15)
+; CHECK-NEXT:    aghi %r15, 32184
+; CHECK-NEXT:    br %r14
+entry:
+  %stack = alloca [8000 x i32], align 4
+  %i = alloca i32, align 4
+  %0 = bitcast [8000 x i32]* %stack to i8*
+  %i.0.i.0..sroa_cast = bitcast i32* %i to i8*
+  store volatile i32 0, i32* %i, align 4
+  %i.0.i.0.6 = load volatile i32, i32* %i, align 4
+  ret void
+}
+
+; Ends evenly on the step so no remainder needed.
+define void @fun3() #0 {
+; CHECK-LABEL: fun3:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    lgr %r1, %r15
+; CHECK-NEXT:    .cfi_def_cfa_register %r1
+; CHECK-NEXT:    aghi %r1, -28672
+; CHECK-NEXT:    .cfi_def_cfa_offset -28832
+; CHECK-NEXT:  .LBB3_1: # %entry
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    aghi %r15, -4096
+; CHECK-NEXT:    cg %r0, 4088(%r15)
+; CHECK-NEXT:    clgrjh %r15, %r1, .LBB3_1
+; CHECK-NEXT:  # %bb.2: # %entry
+; CHECK-NEXT:    .cfi_def_cfa_register %r15
+; CHECK-NEXT:    .cfi_def_cfa_offset 28832
+; CHECK-NEXT:    mvhi 180(%r15), 0
+; CHECK-NEXT:    l %r0, 180(%r15)
+; CHECK-NEXT:    aghi %r15, 28672
+; CHECK-NEXT:    br %r14
+entry:
+  %stack = alloca [7122 x i32], align 4
+  %i = alloca i32, align 4
+  %0 = bitcast [7122 x i32]* %stack to i8*
+  %i.0.i.0..sroa_cast = bitcast i32* %i to i8*
+  store volatile i32 0, i32* %i, align 4
+  %i.0.i.0.6 = load volatile i32, i32* %i, align 4
+  ret void
+}
+
+attributes #0 = {  "probe-stack"="inline-asm"  }
+
Index: llvm/test/CodeGen/SystemZ/stack-clash-small.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/SystemZ/stack-clash-small.ll
@@ -0,0 +1,40 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s
+
+define i32 @foo() #0 {
+; CHECK-LABEL: foo:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    stmg %r11, %r15, 88(%r15)
+; CHECK-NEXT:    .cfi_offset %r11, -72
+; CHECK-NEXT:    .cfi_offset %r15, -40
+; CHECK-NEXT:    aghi %r15, -160
+; CHECK-NEXT:    .cfi_def_cfa_offset 320
+; CHECK-NEXT:    cg %r0, 152(%r15)
+; CHECK-NEXT:    lgr %r11, %r15
+; CHECK-NEXT:    .cfi_def_cfa_register %r11
+; CHECK-NEXT:    lghi %r1, 408
+; CHECK-NEXT:    clgfi %r1, 4096
+; CHECK-NEXT:    jl .LBB0_2
+; CHECK-NEXT:  .LBB0_1: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    slgfi %r1, 4096
+; CHECK-NEXT:    slgfi %r15, 4096
+; CHECK-NEXT:    cg %r15, 4088(%r15)
+; CHECK-NEXT:    clgfi %r1, 4096
+; CHECK-NEXT:    jhe .LBB0_1
+; CHECK-NEXT:  .LBB0_2:
+; CHECK-NEXT:    slgr %r15, %r1
+; CHECK-NEXT:    la %r1, 168(%r15)
+; CHECK-NEXT:    nill %r1, 65520
+; CHECK-NEXT:    mvhi 392(%r1), 1
+; CHECK-NEXT:    l %r2, 0(%r1)
+; CHECK-NEXT:    lmg %r11, %r15, 248(%r11)
+; CHECK-NEXT:    br %r14
+
+  %a = alloca i32, i64 100, align 16
+  %b = getelementptr inbounds i32, i32* %a, i64 98
+  store volatile i32 1, i32* %b
+  %c = load volatile i32, i32* %a
+  ret i32 %c
+}
+
+attributes #0 =  {"probe-stack"="inline-asm"}
Index: llvm/test/CodeGen/SystemZ/stack-clash-unknown-call.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/SystemZ/stack-clash-unknown-call.ll
@@ -0,0 +1,46 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s
+
+declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1 immarg);
+
+define void @foo() #0 {
+; CHECK-LABEL: foo:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    stmg %r11, %r15, 88(%r15)
+; CHECK-NEXT:    .cfi_offset %r11, -72
+; CHECK-NEXT:    .cfi_offset %r15, -40
+; CHECK-NEXT:    aghi %r15, -160
+; CHECK-NEXT:    .cfi_def_cfa_offset 320
+; CHECK-NEXT:    cg %r0, 152(%r15)
+; CHECK-NEXT:    lgr %r11, %r15
+; CHECK-NEXT:    .cfi_def_cfa_register %r11
+; CHECK-NEXT:    lghi %r1, 8008
+; CHECK-NEXT:    clgfi %r1, 4096
+; CHECK-NEXT:    jl .LBB0_2
+; CHECK-NEXT:  .LBB0_1: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    slgfi %r1, 4096
+; CHECK-NEXT:    slgfi %r15, 4096
+; CHECK-NEXT:    cg %r15, 4088(%r15)
+; CHECK-NEXT:    clgfi %r1, 4096
+; CHECK-NEXT:    jhe .LBB0_1
+; CHECK-NEXT:  .LBB0_2:
+; CHECK-NEXT:    slgr %r15, %r1
+; CHECK-NEXT:    la %r1, 168(%r15)
+; CHECK-NEXT:    nill %r1, 65520
+; CHECK-NEXT:    lghi %r0, 31
+; CHECK-NEXT:  .LBB0_3: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    xc 0(256,%r1), 0(%r1)
+; CHECK-NEXT:    la %r1, 256(%r1)
+; CHECK-NEXT:    brctg %r0, .LBB0_3
+; CHECK-NEXT:  # %bb.4:
+; CHECK-NEXT:    xc 0(64,%r1), 0(%r1)
+; CHECK-NEXT:    lmg %r11, %r15, 248(%r11)
+; CHECK-NEXT:    br %r14
+; it's important that we don't use the call as a probe here
+
+  %a = alloca i8, i64 8000, align 16
+  call void @llvm.memset.p0i8.i64(i8* align 16 %a, i8 0, i64 8000, i1 false)
+  ret void
+}
+
+attributes #0 =  {"probe-stack"="inline-asm"}