# Reviewer note: the line structure of this patch had been flattened; it is
# restored below so that each hunk again matches the counts in its "@@" header.
#
# clang/docs/ReleaseNotes.rst: states that -fstack-clash-protection now covers
# s390x in addition to x86.
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -94,8 +94,8 @@
 ------------------
 
 - -fstack-clash-protection will provide a protection against the stack clash
-  attack for x86 architecture through automatic probing of each page of
-  allocated stack.
+  attack for x86 and s390x architectures through automatic probing of each page
+  of allocated stack.
 
 - -ffp-exception-behavior={ignore,maytrap,strict} allows the user to specify
   the floating-point exception behavior. The default setting is ``ignore``.
# clang/lib/Basic/Targets/SystemZ.h: adds isSPRegName(), which accepts exactly
# the register name "r15".
# NOTE(review): the template argument on the ArrayRef context line had been
# stripped from the text; it is restored as <TargetInfo::AddlRegName> --
# confirm against the target tree before applying.
diff --git a/clang/lib/Basic/Targets/SystemZ.h b/clang/lib/Basic/Targets/SystemZ.h
--- a/clang/lib/Basic/Targets/SystemZ.h
+++ b/clang/lib/Basic/Targets/SystemZ.h
@@ -64,6 +64,10 @@
 
   ArrayRef<TargetInfo::AddlRegName> getGCCAddlRegNames() const override;
 
+  bool isSPRegName(StringRef RegName) const override {
+    return RegName.equals("r15");
+  }
+
   bool validateAsmConstraint(const char *&Name,
                              TargetInfo::ConstraintInfo &info) const override;
 
# clang/lib/Driver/ToolChains/Clang.cpp: the early return that rejected every
# non-x86 triple now also lets SystemZ triples through.
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -2997,7 +2997,7 @@
   if (!EffectiveTriple.isOSLinux())
     return;
 
-  if (!EffectiveTriple.isX86())
+  if (!EffectiveTriple.isX86() && !EffectiveTriple.isSystemZ())
     return;
 
   if (Args.hasFlag(options::OPT_fstack_clash_protection,
# clang/test/CodeGen/stack-clash-protection.c: adds an s390x-linux-gnu RUN line
# that is checked against the existing (unprefixed) CHECK lines.
diff --git a/clang/test/CodeGen/stack-clash-protection.c b/clang/test/CodeGen/stack-clash-protection.c
--- a/clang/test/CodeGen/stack-clash-protection.c
+++ b/clang/test/CodeGen/stack-clash-protection.c
@@ -1,5 +1,6 @@
 // Check the correct function attributes are generated
 // RUN: %clang_cc1 -triple x86_64-linux -O0 -S -emit-llvm -o- %s -fstack-clash-protection | FileCheck %s
+// RUN: %clang_cc1 -triple s390x-linux-gnu -O0 -S -emit-llvm -o- %s -fstack-clash-protection | FileCheck %s
 
 // CHECK: define void @large_stack() #[[A:.*]] {
 void large_stack() {
# New driver test: expects "-fstack-clash-protection" in the -### output and a
# warning for inline asm that lists r15 as a clobber.
# NOTE(review): foo() returns `r` without ever assigning it. The test only
# checks driver output and a diagnostic, but consider initialising `r`.
diff --git a/clang/test/Driver/stack-clash-protection-02.c b/clang/test/Driver/stack-clash-protection-02.c
new file mode 100644
--- /dev/null
+++ b/clang/test/Driver/stack-clash-protection-02.c
@@ -0,0 +1,13 @@
+// RUN: %clang -target s390x-linux-gnu -fstack-clash-protection -### %s 2>&1 | FileCheck %s -check-prefix=SystemZ
+// SystemZ: "-fstack-clash-protection"
+// RUN: %clang -target s390x-linux-gnu -fstack-clash-protection -S -emit-llvm -o %t.ll %s 2>&1 | FileCheck %s -check-prefix=SystemZ-warn
+// SystemZ-warn: warning: Unable to protect inline asm that clobbers stack pointer against stack clash
+
+int foo(int c) {
+  int r;
+  __asm__("ag %%r15, %0"
+          :
+          : "rm"(c)
+          : "r15");
+  return r;
+}
# llvm/include/llvm/ADT/Triple.h: adds Triple::isSystemZ(), true when
# getArch() == Triple::systemz.
diff --git a/llvm/include/llvm/ADT/Triple.h b/llvm/include/llvm/ADT/Triple.h
--- a/llvm/include/llvm/ADT/Triple.h
+++ b/llvm/include/llvm/ADT/Triple.h
@@ -739,6 +739,11 @@
     return getArch() == Triple::riscv32 || getArch() == Triple::riscv64;
   }
 
+  /// Tests whether the target is SystemZ.
+  bool isSystemZ() const {
+    return getArch() == Triple::systemz;
+  }
+
   /// Tests whether the target is x86 (32- or 64-bit).
   bool isX86() const {
     return getArch() == Triple::x86 || getArch() == Triple::x86_64;
# SystemZFrameLowering.h: declares the inlineStackProbe() override next to
# emitPrologue()/emitEpilogue().
diff --git a/llvm/lib/Target/SystemZ/SystemZFrameLowering.h b/llvm/lib/Target/SystemZ/SystemZFrameLowering.h
--- a/llvm/lib/Target/SystemZ/SystemZFrameLowering.h
+++ b/llvm/lib/Target/SystemZ/SystemZFrameLowering.h
@@ -43,6 +43,8 @@
                                            RegScavenger *RS) const override;
   void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
   void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+  void inlineStackProbe(MachineFunction &MF,
+                        MachineBasicBlock &PrologMBB) const override;
   bool hasFP(const MachineFunction &MF) const override;
   bool hasReservedCallFrame(const MachineFunction &MF) const override;
   int getFrameIndexReference(const MachineFunction &MF, int FI,
# SystemZFrameLowering.cpp, first hunk: adds file-local CFI helpers.
# buildCFAOffs() registers cfiDefCfaOffset(nullptr, -Offset) with the function
# and inserts the matching CFI_INSTRUCTION before MBBI, so callers pass the
# SP-relative offset and the helper negates it.
diff --git a/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp b/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp
--- a/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp
@@ -374,12 +374,39 @@
   }
 }
 
+// Add CFI for the new CFA offset.
+static void buildCFAOffs(MachineBasicBlock &MBB,
+                         MachineBasicBlock::iterator MBBI,
+                         const DebugLoc &DL, int Offset,
+                         const SystemZInstrInfo *ZII) {
+  unsigned CFIIndex = MBB.getParent()->addFrameInst(
+    MCCFIInstruction::cfiDefCfaOffset(nullptr, -Offset));
+  BuildMI(MBB, MBBI, DL, ZII->get(TargetOpcode::CFI_INSTRUCTION))
+    .addCFIIndex(CFIIndex);
+}
+
+// Add CFI for the new frame location.
+static void buildDefCFAReg(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MBBI,
+                           const DebugLoc &DL, unsigned Reg,
+                           const SystemZInstrInfo *ZII) {
+  MachineFunction &MF = *MBB.getParent();
+  MachineModuleInfo &MMI = MF.getMMI();
+  const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo();
+  unsigned RegNum = MRI->getDwarfRegNum(Reg, true);
+  unsigned CFIIndex = MF.addFrameInst(
+    MCCFIInstruction::createDefCfaRegister(nullptr, RegNum));
+  BuildMI(MBB, MBBI, DL, ZII->get(TargetOpcode::CFI_INSTRUCTION))
+    .addCFIIndex(CFIIndex);
+}
+
 void SystemZFrameLowering::emitPrologue(MachineFunction &MF,
                                         MachineBasicBlock &MBB) const {
   assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");
+  const SystemZSubtarget &STI = MF.getSubtarget<SystemZSubtarget>();
+  const SystemZTargetLowering &TLI = *STI.getTargetLowering();
   MachineFrameInfo &MFFrame = MF.getFrameInfo();
-  auto *ZII =
-      static_cast<const SystemZInstrInfo *>(MF.getSubtarget().getInstrInfo());
+  auto *ZII = static_cast<const SystemZInstrInfo *>(STI.getInstrInfo());
   SystemZMachineFunctionInfo *ZFI = MF.getInfo<SystemZMachineFunctionInfo>();
   MachineBasicBlock::iterator MBBI = MBB.begin();
   MachineModuleInfo &MMI = MF.getMMI();
@@ -462,13 +489,22 @@
 
     // Allocate StackSize bytes.
     int64_t Delta = -int64_t(StackSize);
-    emitIncrement(MBB, MBBI, DL, SystemZ::R15D, Delta, ZII);
-
-    // Add CFI for the allocation.
-    unsigned CFIIndex = MF.addFrameInst(
-        MCCFIInstruction::cfiDefCfaOffset(nullptr, -SPOffsetFromCFA - Delta));
-    BuildMI(MBB, MBBI, DL, ZII->get(TargetOpcode::CFI_INSTRUCTION))
-        .addCFIIndex(CFIIndex);
+    const unsigned ProbeSize = TLI.getStackProbeSize(MF);
+    bool FreeProbe = (ZFI->getSpillGPRRegs().GPROffset &&
+           (ZFI->getSpillGPRRegs().GPROffset + StackSize) < ProbeSize);
+    if (!FreeProbe &&
+        MF.getSubtarget().getTargetLowering()->hasInlineStackProbe(MF)) {
+      // Stack probing may involve looping, but splitting the prologue block
+      // is not possible at this point since it would invalidate the
+      // SaveBlocks / RestoreBlocks sets of PEI in the single block function
+      // case. Build a pseudo to be handled later by inlineStackProbe().
+      BuildMI(MBB, MBBI, DL, ZII->get(SystemZ::PROBED_STACKALLOC))
+        .addImm(StackSize);
+    }
+    else {
+      emitIncrement(MBB, MBBI, DL, SystemZ::R15D, Delta, ZII);
+      buildCFAOffs(MBB, MBBI, DL, SPOffsetFromCFA + Delta, ZII);
+    }
     SPOffsetFromCFA += Delta;
 
     if (StoreBackchain) {
@@ -486,11 +522,7 @@
         .addReg(SystemZ::R15D);
 
     // Add CFI for the new frame location.
-    unsigned HardFP = MRI->getDwarfRegNum(SystemZ::R11D, true);
-    unsigned CFIIndex = MF.addFrameInst(
-        MCCFIInstruction::createDefCfaRegister(nullptr, HardFP));
-    BuildMI(MBB, MBBI, DL, ZII->get(TargetOpcode::CFI_INSTRUCTION))
-        .addCFIIndex(CFIIndex);
+    buildDefCFAReg(MBB, MBBI, DL, SystemZ::R11D, ZII);
 
     // Mark the FramePtr as live at the beginning of every block except
     // the entry block. (We'll have marked R11 as live on entry when
@@ -583,6 +615,91 @@
   }
 }
 
+void SystemZFrameLowering::inlineStackProbe(MachineFunction &MF,
+                                            MachineBasicBlock &PrologMBB) const {
+  auto *ZII =
+    static_cast<const SystemZInstrInfo *>(MF.getSubtarget().getInstrInfo());
+  const SystemZSubtarget &STI = MF.getSubtarget<SystemZSubtarget>();
+  const SystemZTargetLowering &TLI = *STI.getTargetLowering();
+
+  MachineInstr *StackAllocMI = nullptr;
+  for (MachineInstr &MI : PrologMBB)
+    if (MI.getOpcode() == SystemZ::PROBED_STACKALLOC) {
+      StackAllocMI = &MI;
+      break;
+    }
+  if (StackAllocMI == nullptr)
+    return;
+  uint64_t StackSize = StackAllocMI->getOperand(0).getImm();
+  const unsigned ProbeSize = TLI.getStackProbeSize(MF);
+  uint64_t NumFullBlocks = StackSize / ProbeSize;
+  uint64_t Residual = StackSize % ProbeSize;
+  int64_t SPOffsetFromCFA = -SystemZMC::CFAOffsetFromInitialSP;
+  MachineBasicBlock *MBB = &PrologMBB;
+  MachineBasicBlock::iterator MBBI = StackAllocMI;
+  const DebugLoc DL = StackAllocMI->getDebugLoc();
+
+  // Allocate a block of Size bytes on the stack and probe it.
+  auto allocateAndProbe = [&](MachineBasicBlock &InsMBB,
+                              MachineBasicBlock::iterator InsPt, unsigned Size,
+                              bool EmitCFI) -> void {
+    emitIncrement(InsMBB, InsPt, DL, SystemZ::R15D, -int64_t(Size), ZII);
+    if (EmitCFI) {
+      SPOffsetFromCFA -= Size;
+      buildCFAOffs(InsMBB, InsPt, DL, SPOffsetFromCFA, ZII);
+    }
+    // Probe by means of a volatile compare.
+    MachineMemOperand *MMO = MF.getMachineMemOperand(MachinePointerInfo(),
+      MachineMemOperand::MOVolatile | MachineMemOperand::MOLoad, 8, Align(1));
+    BuildMI(InsMBB, InsPt, DL, ZII->get(SystemZ::CG))
+      .addReg(SystemZ::R0D, RegState::Undef)
+      .addReg(SystemZ::R15D).addImm(Size - 8).addReg(0)
+      .addMemOperand(MMO);
+  };
+
+  if (NumFullBlocks < 3) {
+    // Emit unrolled probe statements.
+    for (unsigned int i = 0; i < NumFullBlocks; i++)
+      allocateAndProbe(*MBB, MBBI, ProbeSize, true/*EmitCFI*/);
+  } else {
+    // Emit a loop probing the pages.
+    uint64_t LoopAlloc = ProbeSize * NumFullBlocks;
+    SPOffsetFromCFA -= LoopAlloc;
+
+    BuildMI(*MBB, MBBI, DL, ZII->get(SystemZ::LGR), SystemZ::R1D)
+      .addReg(SystemZ::R15D);
+    buildDefCFAReg(*MBB, MBBI, DL, SystemZ::R1D, ZII);
+    emitIncrement(*MBB, MBBI, DL, SystemZ::R1D, -int64_t(LoopAlloc), ZII);
+    buildCFAOffs(*MBB, MBBI, DL, -int64_t(SystemZMC::CallFrameSize + LoopAlloc),
+                 ZII);
+
+    MachineBasicBlock *DoneMBB = SystemZ::splitBlockBefore(MBBI, MBB);
+    MachineBasicBlock *LoopMBB = SystemZ::emitBlockAfter(MBB);
+    MBB->addSuccessor(LoopMBB);
+    LoopMBB->addSuccessor(LoopMBB);
+    LoopMBB->addSuccessor(DoneMBB);
+
+    MBB = LoopMBB;
+    allocateAndProbe(*MBB, MBB->end(), ProbeSize, false/*EmitCFI*/);
+    BuildMI(*MBB, MBB->end(), DL, ZII->get(SystemZ::CLGR))
+      .addReg(SystemZ::R15D).addReg(SystemZ::R1D);
+    BuildMI(*MBB, MBB->end(), DL, ZII->get(SystemZ::BRC))
+      .addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_GT).addMBB(MBB);
+
+    MBB = DoneMBB;
+    MBBI = DoneMBB->begin();
+    buildDefCFAReg(*MBB, MBBI, DL, SystemZ::R15D, ZII);
+
+    recomputeLiveIns(*DoneMBB);
+    recomputeLiveIns(*LoopMBB);
+  }
+
+  if (Residual)
+    allocateAndProbe(*MBB, MBBI, Residual, true/*EmitCFI*/);
+
+  StackAllocMI->eraseFromParent();
+}
+
 bool SystemZFrameLowering::hasFP(const MachineFunction &MF) const {
   return (MF.getTarget().Options.DisableFramePointerElim(MF) ||
           MF.getFrameInfo().hasVarSizedObjects() ||
# Reviewer notes on the SystemZFrameLowering.cpp hunks above:
# - buildDefCFAReg() looks up the DWARF number of Reg and inserts a
#   def_cfa_register CFI instruction before MBBI.
# - emitPrologue() emits a PROBED_STACKALLOC pseudo carrying StackSize instead
#   of the plain R15 decrement when hasInlineStackProbe(MF) is true and
#   FreeProbe is false. FreeProbe is set when GPROffset is non-zero and
#   GPROffset + StackSize is below the probe size.
# - inlineStackProbe() replaces the first PROBED_STACKALLOC found in the
#   prologue block. Fewer than three full ProbeSize blocks are unrolled;
#   otherwise R1 receives the target SP value and a self-looping block
#   allocates and probes ProbeSize bytes per iteration while R15 compares
#   greater than R1. A non-zero remainder is allocated and probed last.
# - Each probe is a volatile 8-byte CG at offset Size - 8 from the new R15.
# - Restored in this revision: the template arguments of
#   static_cast<const SystemZInstrInfo *>, getSubtarget<SystemZSubtarget>()
#   and getInfo<SystemZMachineFunctionInfo>(), which had been stripped from
#   the patch text and left it ill-formed.
# - NOTE(review): emitPrologue() already has TLI in scope; the condition
#   could call TLI.hasInlineStackProbe(MF) directly.
#
# SystemZISelLowering.h: adds the PROBED_ALLOCA node kind and declares
# hasInlineStackProbe(), getStackProbeSize() and emitProbedAlloca().
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.h b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.h
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
@@ -83,6 +83,10 @@
   // base of the dynamically-allocatable area.
   ADJDYNALLOC,
 
+  // For allocating stack space when using stack clash protector.
+  // Allocation is performed by block, and each block is probed.
+  PROBED_ALLOCA,
+
   // Count number of bits set in operand 0 per byte.
   POPCNT,
 
@@ -428,6 +432,7 @@
                                     EVT VT) const override;
   bool isFPImmLegal(const APFloat &Imm, EVT VT,
                     bool ForCodeSize) const override;
+  bool hasInlineStackProbe(MachineFunction &MF) const override;
   bool isLegalICmpImmediate(int64_t Imm) const override;
   bool isLegalAddImmediate(int64_t Imm) const override;
   bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty,
@@ -556,6 +561,8 @@
     return true;
   }
 
+  unsigned getStackProbeSize(MachineFunction &MF) const;
+
 private:
   const SystemZSubtarget &Subtarget;
 
@@ -691,6 +698,8 @@
   MachineBasicBlock *emitLoadAndTestCmp0(MachineInstr &MI,
                                          MachineBasicBlock *MBB,
                                          unsigned Opcode) const;
+  MachineBasicBlock *emitProbedAlloca(MachineInstr &MI,
+                                      MachineBasicBlock *MBB) const;
 
   MachineMemOperand::Flags
   getTargetMMOFlags(const Instruction &I) const override;
# SystemZISelLowering.cpp: implementation of the members declared above.
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -826,6 +826,15 @@
   return SystemZVectorConstantInfo(Imm).isVectorConstantLegal(Subtarget);
 }
 
+/// Returns true if stack probing through inline
assembly is requested.
+bool SystemZTargetLowering::hasInlineStackProbe(MachineFunction &MF) const {
+  // If the function specifically requests inline stack probes, emit them.
+  if (MF.getFunction().hasFnAttribute("probe-stack"))
+    return MF.getFunction().getFnAttribute("probe-stack").getValueAsString() ==
+           "inline-asm";
+  return false;
+}
+
 bool SystemZTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
   // We can use CGFI or CLGFI.
   return isInt<32>(Imm) || isUInt<32>(Imm);
@@ -3428,10 +3437,17 @@
                               DAG.getConstant(ExtraAlignSpace, DL, MVT::i64));
 
   // Get the new stack pointer value.
-  SDValue NewSP = DAG.getNode(ISD::SUB, DL, MVT::i64, OldSP, NeededSpace);
-
-  // Copy the new stack pointer back.
-  Chain = DAG.getCopyToReg(Chain, DL, SPReg, NewSP);
+  SDValue NewSP;
+  if (hasInlineStackProbe(MF)) {
+    NewSP = DAG.getNode(SystemZISD::PROBED_ALLOCA, DL,
+                DAG.getVTList(MVT::i64, MVT::Other), Chain, OldSP, NeededSpace);
+    Chain = NewSP.getValue(1); // Result 1 of the node is the chain.
+  }
+  else {
+    NewSP = DAG.getNode(ISD::SUB, DL, MVT::i64, OldSP, NeededSpace);
+    // Copy the new stack pointer back.
+    Chain = DAG.getCopyToReg(Chain, DL, SPReg, NewSP);
+  }
 
   // The allocated data lives above the 160 bytes allocated for the standard
   // frame, plus any outgoing stack arguments. We don't know how much that
@@ -5400,6 +5416,7 @@
     OPCODE(BR_CCMASK);
     OPCODE(SELECT_CCMASK);
     OPCODE(ADJDYNALLOC);
+    OPCODE(PROBED_ALLOCA);
     OPCODE(POPCNT);
     OPCODE(SMUL_LOHI);
     OPCODE(UMUL_LOHI);
@@ -6825,38 +6842,29 @@
   return 1;
 }
 
+unsigned
+SystemZTargetLowering::getStackProbeSize(MachineFunction &MF) const {
+  const TargetFrameLowering *TFI = Subtarget.getFrameLowering();
+  unsigned StackAlign = TFI->getStackAlignment();
+  assert(StackAlign >=1 && isPowerOf2_32(StackAlign) &&
+         "Unexpected stack alignment");
+  // The default stack probe size is 4096 if the function has no
+  // stack-probe-size attribute.
+  unsigned StackProbeSize = 4096;
+  const Function &Fn = MF.getFunction();
+  if (Fn.hasFnAttribute("stack-probe-size"))
+    Fn.getFnAttribute("stack-probe-size")
+        .getValueAsString()
+        .getAsInteger(0, StackProbeSize);
+  // Round down to a multiple of the stack alignment (a power of 2).
+  StackProbeSize &= ~(StackAlign - 1);
+  return StackProbeSize ? StackProbeSize : StackAlign; // Never returns 0.
+}
+
 //===----------------------------------------------------------------------===//
 // Custom insertion
 //===----------------------------------------------------------------------===//
 
-// Create a new basic block after MBB.
-static MachineBasicBlock *emitBlockAfter(MachineBasicBlock *MBB) {
-  MachineFunction &MF = *MBB->getParent();
-  MachineBasicBlock *NewMBB = MF.CreateMachineBasicBlock(MBB->getBasicBlock());
-  MF.insert(std::next(MachineFunction::iterator(MBB)), NewMBB);
-  return NewMBB;
-}
-
-// Split MBB after MI and return the new block (the one that contains
-// instructions after MI).
-static MachineBasicBlock *splitBlockAfter(MachineBasicBlock::iterator MI,
-                                          MachineBasicBlock *MBB) {
-  MachineBasicBlock *NewMBB = emitBlockAfter(MBB);
-  NewMBB->splice(NewMBB->begin(), MBB,
-                 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
-  NewMBB->transferSuccessorsAndUpdatePHIs(MBB);
-  return NewMBB;
-}
-
-// Split MBB before MI and return the new block (the one that contains MI).
-static MachineBasicBlock *splitBlockBefore(MachineBasicBlock::iterator MI,
-                                           MachineBasicBlock *MBB) {
-  MachineBasicBlock *NewMBB = emitBlockAfter(MBB);
-  NewMBB->splice(NewMBB->begin(), MBB, MI, MBB->end());
-  NewMBB->transferSuccessorsAndUpdatePHIs(MBB);
-  return NewMBB;
-}
-
 // Force base value Base into a register before MI. Return the register.
static Register forceReg(MachineInstr &MI, MachineOperand &Base, const SystemZInstrInfo *TII) { @@ -7027,8 +7035,8 @@ bool CCKilled = (LastMI->killsRegister(SystemZ::CC) || checkCCKill(*LastMI, MBB)); MachineBasicBlock *StartMBB = MBB; - MachineBasicBlock *JoinMBB = splitBlockAfter(LastMI, MBB); - MachineBasicBlock *FalseMBB = emitBlockAfter(StartMBB); + MachineBasicBlock *JoinMBB = SystemZ::splitBlockAfter(LastMI, MBB); + MachineBasicBlock *FalseMBB = SystemZ::emitBlockAfter(StartMBB); // Unless CC was killed in the last Select instruction, mark it as // live-in to both FalseMBB and JoinMBB. @@ -7121,8 +7129,8 @@ CCMask ^= CCValid; MachineBasicBlock *StartMBB = MBB; - MachineBasicBlock *JoinMBB = splitBlockBefore(MI, MBB); - MachineBasicBlock *FalseMBB = emitBlockAfter(StartMBB); + MachineBasicBlock *JoinMBB = SystemZ::splitBlockBefore(MI, MBB); + MachineBasicBlock *FalseMBB = SystemZ::emitBlockAfter(StartMBB); // Unless CC was killed in the CondStore instruction, mark it as // live-in to both FalseMBB and JoinMBB. @@ -7205,8 +7213,8 @@ // Insert a basic block for the main loop. MachineBasicBlock *StartMBB = MBB; - MachineBasicBlock *DoneMBB = splitBlockBefore(MI, MBB); - MachineBasicBlock *LoopMBB = emitBlockAfter(StartMBB); + MachineBasicBlock *DoneMBB = SystemZ::splitBlockBefore(MI, MBB); + MachineBasicBlock *LoopMBB = SystemZ::emitBlockAfter(StartMBB); // StartMBB: // ... @@ -7323,10 +7331,10 @@ // Insert 3 basic blocks for the loop. 
MachineBasicBlock *StartMBB = MBB; - MachineBasicBlock *DoneMBB = splitBlockBefore(MI, MBB); - MachineBasicBlock *LoopMBB = emitBlockAfter(StartMBB); - MachineBasicBlock *UseAltMBB = emitBlockAfter(LoopMBB); - MachineBasicBlock *UpdateMBB = emitBlockAfter(UseAltMBB); + MachineBasicBlock *DoneMBB = SystemZ::splitBlockBefore(MI, MBB); + MachineBasicBlock *LoopMBB = SystemZ::emitBlockAfter(StartMBB); + MachineBasicBlock *UseAltMBB = SystemZ::emitBlockAfter(LoopMBB); + MachineBasicBlock *UpdateMBB = SystemZ::emitBlockAfter(UseAltMBB); // StartMBB: // ... @@ -7434,9 +7442,9 @@ // Insert 2 basic blocks for the loop. MachineBasicBlock *StartMBB = MBB; - MachineBasicBlock *DoneMBB = splitBlockBefore(MI, MBB); - MachineBasicBlock *LoopMBB = emitBlockAfter(StartMBB); - MachineBasicBlock *SetMBB = emitBlockAfter(LoopMBB); + MachineBasicBlock *DoneMBB = SystemZ::splitBlockBefore(MI, MBB); + MachineBasicBlock *LoopMBB = SystemZ::emitBlockAfter(StartMBB); + MachineBasicBlock *SetMBB = SystemZ::emitBlockAfter(LoopMBB); // StartMBB: // ... @@ -7596,7 +7604,7 @@ // When generating more than one CLC, all but the last will need to // branch to the end when a difference is found. MachineBasicBlock *EndMBB = (Length > 256 && Opcode == SystemZ::CLC ? - splitBlockAfter(MI, MBB) : nullptr); + SystemZ::splitBlockAfter(MI, MBB) : nullptr); // Check for the loop form, in which operand 5 is the trip count. if (MI.getNumExplicitOperands() > 5) { @@ -7620,9 +7628,10 @@ Register NextCountReg = MRI.createVirtualRegister(RC); MachineBasicBlock *StartMBB = MBB; - MachineBasicBlock *DoneMBB = splitBlockBefore(MI, MBB); - MachineBasicBlock *LoopMBB = emitBlockAfter(StartMBB); - MachineBasicBlock *NextMBB = (EndMBB ? emitBlockAfter(LoopMBB) : LoopMBB); + MachineBasicBlock *DoneMBB = SystemZ::splitBlockBefore(MI, MBB); + MachineBasicBlock *LoopMBB = SystemZ::emitBlockAfter(StartMBB); + MachineBasicBlock *NextMBB = + (EndMBB ? 
SystemZ::emitBlockAfter(LoopMBB) : LoopMBB); // StartMBB: // # fall through to LoopMMB @@ -7738,7 +7747,7 @@ // If there's another CLC to go, branch to the end if a difference // was found. if (EndMBB && Length > 0) { - MachineBasicBlock *NextMBB = splitBlockBefore(MI, MBB); + MachineBasicBlock *NextMBB = SystemZ::splitBlockBefore(MI, MBB); BuildMI(MBB, DL, TII->get(SystemZ::BRC)) .addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_NE) .addMBB(EndMBB); @@ -7778,8 +7787,8 @@ uint64_t End2Reg = MRI.createVirtualRegister(RC); MachineBasicBlock *StartMBB = MBB; - MachineBasicBlock *DoneMBB = splitBlockBefore(MI, MBB); - MachineBasicBlock *LoopMBB = emitBlockAfter(StartMBB); + MachineBasicBlock *DoneMBB = SystemZ::splitBlockBefore(MI, MBB); + MachineBasicBlock *LoopMBB = SystemZ::emitBlockAfter(StartMBB); // StartMBB: // # fall through to LoopMMB @@ -7890,6 +7899,97 @@ return MBB; } +MachineBasicBlock *SystemZTargetLowering::emitProbedAlloca( + MachineInstr &MI, MachineBasicBlock *MBB) const { + MachineFunction &MF = *MBB->getParent(); + MachineRegisterInfo *MRI = &MF.getRegInfo(); + const SystemZInstrInfo *TII = + static_cast(Subtarget.getInstrInfo()); + DebugLoc DL = MI.getDebugLoc(); + const unsigned ProbeSize = getStackProbeSize(MF); + Register DstReg = MI.getOperand(0).getReg(); + Register SizeReg = MI.getOperand(2).getReg(); + + MachineBasicBlock *StartMBB = MBB; + MachineBasicBlock *DoneMBB = SystemZ::splitBlockAfter(MI, MBB); + MachineBasicBlock *LoopTestMBB = SystemZ::emitBlockAfter(StartMBB); + MachineBasicBlock *LoopBodyMBB = SystemZ::emitBlockAfter(LoopTestMBB); + MachineBasicBlock *TailTestMBB = SystemZ::emitBlockAfter(LoopBodyMBB); + MachineBasicBlock *TailMBB = SystemZ::emitBlockAfter(TailTestMBB); + + MachineMemOperand *VolLdMMO = MF.getMachineMemOperand(MachinePointerInfo(), + MachineMemOperand::MOVolatile | MachineMemOperand::MOLoad, 8, Align(1)); + + Register PHIReg = MRI->createVirtualRegister(&SystemZ::ADDR64BitRegClass); + Register IncReg = 
MRI->createVirtualRegister(&SystemZ::ADDR64BitRegClass); + + // LoopTestMBB + // BRC TailTestMBB + // # fallthrough to LoopBodyMBB + StartMBB->addSuccessor(LoopTestMBB); + MBB = LoopTestMBB; + BuildMI(MBB, DL, TII->get(SystemZ::PHI), PHIReg) + .addReg(SizeReg) + .addMBB(StartMBB) + .addReg(IncReg) + .addMBB(LoopBodyMBB); + BuildMI(MBB, DL, TII->get(SystemZ::CLGFI)) + .addReg(PHIReg) + .addImm(ProbeSize); + BuildMI(MBB, DL, TII->get(SystemZ::BRC)) + .addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_LT) + .addMBB(TailTestMBB); + MBB->addSuccessor(LoopBodyMBB); + MBB->addSuccessor(TailTestMBB); + + // LoopBodyMBB: Allocate and probe by means of a volatile compare. + // J LoopTestMBB + MBB = LoopBodyMBB; + BuildMI(MBB, DL, TII->get(SystemZ::SLGFI), IncReg) + .addReg(PHIReg) + .addImm(ProbeSize); + BuildMI(MBB, DL, TII->get(SystemZ::SLGFI), SystemZ::R15D) + .addReg(SystemZ::R15D) + .addImm(ProbeSize); + BuildMI(MBB, DL, TII->get(SystemZ::CG)).addReg(SystemZ::R15D) + .addReg(SystemZ::R15D).addImm(ProbeSize - 8).addReg(0) + .setMemRefs(VolLdMMO); + BuildMI(MBB, DL, TII->get(SystemZ::J)).addMBB(LoopTestMBB); + MBB->addSuccessor(LoopTestMBB); + + // TailTestMBB + // BRC DoneMBB + // # fallthrough to TailMBB + MBB = TailTestMBB; + BuildMI(MBB, DL, TII->get(SystemZ::CGHI)) + .addReg(PHIReg) + .addImm(0); + BuildMI(MBB, DL, TII->get(SystemZ::BRC)) + .addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_EQ) + .addMBB(DoneMBB); + MBB->addSuccessor(TailMBB); + MBB->addSuccessor(DoneMBB); + + // TailMBB + // # fallthrough to DoneMBB + MBB = TailMBB; + BuildMI(MBB, DL, TII->get(SystemZ::SLGR), SystemZ::R15D) + .addReg(SystemZ::R15D) + .addReg(PHIReg); + BuildMI(MBB, DL, TII->get(SystemZ::CG)).addReg(SystemZ::R15D) + .addReg(SystemZ::R15D).addImm(-8).addReg(PHIReg) + .setMemRefs(VolLdMMO); + MBB->addSuccessor(DoneMBB); + + // DoneMBB + MBB = DoneMBB; + BuildMI(*MBB, MBB->begin(), DL, TII->get(TargetOpcode::COPY), DstReg) + .addReg(SystemZ::R15D); + + MI.eraseFromParent(); 
+ return DoneMBB; +} + MachineBasicBlock *SystemZTargetLowering::EmitInstrWithCustomInserter( MachineInstr &MI, MachineBasicBlock *MBB) const { switch (MI.getOpcode()) { @@ -8150,6 +8250,9 @@ case SystemZ::LTXBRCompare_VecPseudo: return emitLoadAndTestCmp0(MI, MBB, SystemZ::LTXBR); + case SystemZ::PROBED_ALLOCA: + return emitProbedAlloca(MI, MBB); + case TargetOpcode::STACKMAP: case TargetOpcode::PATCHPOINT: return emitPatchPoint(MI, MBB); diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.h b/llvm/lib/Target/SystemZ/SystemZInstrInfo.h --- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.h +++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.h @@ -159,6 +159,16 @@ // Return a version of comparison CC mask CCMask in which the LT and GT // actions are swapped. unsigned reverseCCMask(unsigned CCMask); + +// Create a new basic block after MBB. +MachineBasicBlock *emitBlockAfter(MachineBasicBlock *MBB); +// Split MBB after MI and return the new block (the one that contains +// instructions after MI). +MachineBasicBlock *splitBlockAfter(MachineBasicBlock::iterator MI, + MachineBasicBlock *MBB); +// Split MBB before MI and return the new block (the one that contains MI). 
+MachineBasicBlock *splitBlockBefore(MachineBasicBlock::iterator MI,
+                                    MachineBasicBlock *MBB);
 }
 
 class SystemZInstrInfo : public SystemZGenInstrInfo {
# SystemZInstrInfo.cpp: definitions of the SystemZ:: block helpers.
# - emitBlockAfter() creates a machine basic block for MBB's IR block and
#   inserts it immediately after MBB in the function.
# - splitBlockAfter()/splitBlockBefore() create such a block, splice into it
#   everything after MI (respectively from MI onwards) up to MBB->end(), and
#   transfer MBB's successors (updating PHIs) to the new block.
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
--- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
@@ -1872,6 +1872,30 @@
           (CCMask & SystemZ::CCMASK_CMP_UO));
 }
 
+MachineBasicBlock *SystemZ::emitBlockAfter(MachineBasicBlock *MBB) {
+  MachineFunction &MF = *MBB->getParent();
+  MachineBasicBlock *NewMBB = MF.CreateMachineBasicBlock(MBB->getBasicBlock());
+  MF.insert(std::next(MachineFunction::iterator(MBB)), NewMBB);
+  return NewMBB;
+}
+
+MachineBasicBlock *SystemZ::splitBlockAfter(MachineBasicBlock::iterator MI,
+                                            MachineBasicBlock *MBB) {
+  MachineBasicBlock *NewMBB = emitBlockAfter(MBB);
+  NewMBB->splice(NewMBB->begin(), MBB,
+                 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
+  NewMBB->transferSuccessorsAndUpdatePHIs(MBB);
+  return NewMBB;
+}
+
+MachineBasicBlock *SystemZ::splitBlockBefore(MachineBasicBlock::iterator MI,
+                                             MachineBasicBlock *MBB) {
+  MachineBasicBlock *NewMBB = emitBlockAfter(MBB);
+  NewMBB->splice(NewMBB->begin(), MBB, MI, MBB->end());
+  NewMBB->transferSuccessorsAndUpdatePHIs(MBB);
+  return NewMBB;
+}
+
 unsigned SystemZInstrInfo::getLoadAndTrap(unsigned Opcode) const {
   if (!STI.hasLoadAndTrap())
     return 0;
# SystemZInstrInfo.td: two new pseudos.
# - PROBED_ALLOCA takes $oldSP and $space, yields $dst, is matched from
#   z_probed_alloca, and sets usesCustomInserter; it is declared to define
#   R15D and CC and to use R15D.
# - PROBED_STACKALLOC takes an immediate stack size, has no pattern, sets
#   hasSideEffects, and additionally lists R1D among its Defs.
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.td b/llvm/lib/Target/SystemZ/SystemZInstrInfo.td
--- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.td
+++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.td
@@ -29,6 +29,15 @@
 def ADJDYNALLOC : Pseudo<(outs GR64:$dst), (ins dynalloc12only:$src),
                          [(set GR64:$dst, dynalloc12only:$src)]>;
 
+let Defs = [R15D, CC], Uses = [R15D], hasNoSchedulingInfo = 1,
+    usesCustomInserter = 1 in
+  def PROBED_ALLOCA : Pseudo<(outs GR64:$dst),
+                             (ins GR64:$oldSP, GR64:$space),
+           [(set GR64:$dst, (z_probed_alloca GR64:$oldSP, GR64:$space))]>;
+
+let Defs = [R1D, R15D, CC], Uses = [R15D], hasNoSchedulingInfo = 1,
+    hasSideEffects = 1 in
+  def PROBED_STACKALLOC : Pseudo<(outs), (ins i64imm:$stacksize), []>;
 
 //===----------------------------------------------------------------------===//
 // Branch instructions
# SystemZOperators.td: type profile (one result, two operands, all the same
# pointer type) and the chained z_probed_alloca SDNode for
# SystemZISD::PROBED_ALLOCA.
diff --git a/llvm/lib/Target/SystemZ/SystemZOperators.td b/llvm/lib/Target/SystemZ/SystemZOperators.td
--- a/llvm/lib/Target/SystemZ/SystemZOperators.td
+++ b/llvm/lib/Target/SystemZ/SystemZOperators.td
@@ -40,6 +40,10 @@
                                              SDTCisSameAs<0, 2>,
                                              SDTCisPtrTy<0>]>;
 def SDT_ZAdjDynAlloc : SDTypeProfile<1, 0, [SDTCisVT<0, i64>]>;
+def SDT_ZProbedAlloca : SDTypeProfile<1, 2,
+                                      [SDTCisSameAs<0, 1>,
+                                       SDTCisSameAs<0, 2>,
+                                       SDTCisPtrTy<0>]>;
 def SDT_ZGR128Binary : SDTypeProfile<1, 2,
                                      [SDTCisVT<0, untyped>,
                                       SDTCisInt<1>,
@@ -269,6 +273,8 @@
                                  SDT_ZSelectCCMask>;
 def z_ipm_1 : SDNode<"SystemZISD::IPM", SDT_ZIPM>;
 def z_adjdynalloc : SDNode<"SystemZISD::ADJDYNALLOC", SDT_ZAdjDynAlloc>;
+def z_probed_alloca : SDNode<"SystemZISD::PROBED_ALLOCA", SDT_ZProbedAlloca,
+                             [SDNPHasChain]>;
 def z_popcnt : SDNode<"SystemZISD::POPCNT", SDTIntUnaryOp>;
 def z_smul_lohi : SDNode<"SystemZISD::SMUL_LOHI", SDT_ZGR128Binary>;
 def z_umul_lohi : SDNode<"SystemZISD::UMUL_LOHI", SDT_ZGR128Binary>;
# New llc test for dynamic allocas under "probe-stack"="inline-asm":
# fun0 uses the default probe size (4096 in the expected output), fun1 asks
# for 1250 and expects 1248, fun2 asks for 4 and expects 8. The CHECK lines
# are autogenerated (see the NOTE line); regenerate rather than hand-edit.
diff --git a/llvm/test/CodeGen/SystemZ/stack-clash-dynamic-alloca.ll b/llvm/test/CodeGen/SystemZ/stack-clash-dynamic-alloca.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/stack-clash-dynamic-alloca.ll
@@ -0,0 +1,136 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s
+
+define i32 @fun0(i32 %n) #0 {
+; CHECK-LABEL: fun0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: stmg %r11, %r15, 88(%r15)
+; CHECK-NEXT: .cfi_offset %r11, -72
+; CHECK-NEXT: .cfi_offset %r15, -40
+; CHECK-NEXT: aghi %r15, -160
+; CHECK-NEXT: .cfi_def_cfa_offset 320
+; CHECK-NEXT: lgr %r11, %r15
+; CHECK-NEXT: .cfi_def_cfa_register %r11
+; CHECK-NEXT: # kill: def $r2l killed $r2l def $r2d
+; CHECK-NEXT: risbgn %r1, %r2, 30, 189, 2
+; CHECK-NEXT: la %r0, 7(%r1)
+; CHECK-NEXT: risbgn %r1, %r0, 29, 188, 0
+; CHECK-NEXT: clgfi %r1, 4096
+; CHECK-NEXT: jl .LBB0_2
+; CHECK-NEXT: .LBB0_1: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: slgfi %r1, 4096
+; CHECK-NEXT: slgfi %r15, 4096
+; CHECK-NEXT: cg %r15, 4088(%r15)
+; CHECK-NEXT: clgfi %r1, 4096
+; CHECK-NEXT: jhe .LBB0_1
+; CHECK-NEXT: .LBB0_2:
+; CHECK-NEXT: cgije %r1, 0, .LBB0_4
+; CHECK-NEXT: # %bb.3:
+; CHECK-NEXT: slgr %r15, %r1
+; CHECK-NEXT: cg %r15, -8(%r1,%r15)
+; CHECK-NEXT: .LBB0_4:
+; CHECK-NEXT: la %r1, 160(%r15)
+; CHECK-NEXT: lhi %r0, 1
+; CHECK-NEXT: sty %r0, 4792(%r1)
+; CHECK-NEXT: l %r2, 0(%r1)
+; CHECK-NEXT: lmg %r11, %r15, 248(%r11)
+; CHECK-NEXT: br %r14
+
+  %a = alloca i32, i32 %n
+  %b = getelementptr inbounds i32, i32* %a, i64 1198
+  store volatile i32 1, i32* %b
+  %c = load volatile i32, i32* %a
+  ret i32 %c
+}
+
+; Probe size should be modulo stack alignment.
+define i32 @fun1(i32 %n) #0 "stack-probe-size"="1250" {
+; CHECK-LABEL: fun1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: stmg %r11, %r15, 88(%r15)
+; CHECK-NEXT: .cfi_offset %r11, -72
+; CHECK-NEXT: .cfi_offset %r15, -40
+; CHECK-NEXT: aghi %r15, -160
+; CHECK-NEXT: .cfi_def_cfa_offset 320
+; CHECK-NEXT: lgr %r11, %r15
+; CHECK-NEXT: .cfi_def_cfa_register %r11
+; CHECK-NEXT: # kill: def $r2l killed $r2l def $r2d
+; CHECK-NEXT: risbgn %r1, %r2, 30, 189, 2
+; CHECK-NEXT: la %r0, 7(%r1)
+; CHECK-NEXT: risbgn %r1, %r0, 29, 188, 0
+; CHECK-NEXT: clgfi %r1, 1248
+; CHECK-NEXT: jl .LBB1_2
+; CHECK-NEXT: .LBB1_1: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: slgfi %r1, 1248
+; CHECK-NEXT: slgfi %r15, 1248
+; CHECK-NEXT: cg %r15, 1240(%r15)
+; CHECK-NEXT: clgfi %r1, 1248
+; CHECK-NEXT: jhe .LBB1_1
+; CHECK-NEXT: .LBB1_2:
+; CHECK-NEXT: cgije %r1, 0, .LBB1_4
+; CHECK-NEXT: # %bb.3:
+; CHECK-NEXT: slgr %r15, %r1
+; CHECK-NEXT: cg %r15, -8(%r1,%r15)
+; CHECK-NEXT: .LBB1_4:
+; CHECK-NEXT: la %r1, 160(%r15)
+; CHECK-NEXT: lhi %r0, 1
+; CHECK-NEXT: sty %r0, 4792(%r1)
+; CHECK-NEXT: l %r2, 0(%r1)
+; CHECK-NEXT: lmg %r11, %r15, 248(%r11)
+; CHECK-NEXT: br %r14
+  %a = alloca i32, i32 %n
+  %b = getelementptr inbounds i32, i32* %a, i64 1198
+  store volatile i32 1, i32* %b
+  %c = load volatile i32, i32* %a
+  ret i32 %c
+}
+
+; The minimum probe size is the stack alignment.
+define i32 @fun2(i32 %n) #0 "stack-probe-size"="4" {
+; CHECK-LABEL: fun2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: stmg %r11, %r15, 88(%r15)
+; CHECK-NEXT: .cfi_offset %r11, -72
+; CHECK-NEXT: .cfi_offset %r15, -40
+; CHECK-NEXT: lgr %r1, %r15
+; CHECK-NEXT: .cfi_def_cfa_register %r1
+; CHECK-NEXT: aghi %r1, -160
+; CHECK-NEXT: .cfi_def_cfa_offset 320
+; CHECK-NEXT: .LBB2_1: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: aghi %r15, -8
+; CHECK-NEXT: cg %r0, 0(%r15)
+; CHECK-NEXT: clgrjh %r15, %r1, .LBB2_1
+; CHECK-NEXT: # %bb.2:
+; CHECK-NEXT: .cfi_def_cfa_register %r15
+; CHECK-NEXT: lgr %r11, %r15
+; CHECK-NEXT: .cfi_def_cfa_register %r11
+; CHECK-NEXT: # kill: def $r2l killed $r2l def $r2d
+; CHECK-NEXT: risbgn %r1, %r2, 30, 189, 2
+; CHECK-NEXT: la %r0, 7(%r1)
+; CHECK-NEXT: risbgn %r1, %r0, 29, 188, 0
+; CHECK-NEXT: clgijl %r1, 8, .LBB2_4
+; CHECK-NEXT: .LBB2_3: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: slgfi %r1, 8
+; CHECK-NEXT: slgfi %r15, 8
+; CHECK-NEXT: cg %r15, 0(%r15)
+; CHECK-NEXT: clgijhe %r1, 8, .LBB2_3
+; CHECK-NEXT: .LBB2_4:
+; CHECK-NEXT: cgije %r1, 0, .LBB2_6
+; CHECK-NEXT: # %bb.5:
+; CHECK-NEXT: slgr %r15, %r1
+; CHECK-NEXT: cg %r15, -8(%r1,%r15)
+; CHECK-NEXT: .LBB2_6:
+; CHECK-NEXT: la %r1, 160(%r15)
+; CHECK-NEXT: lhi %r0, 1
+; CHECK-NEXT: sty %r0, 4792(%r1)
+; CHECK-NEXT: l %r2, 0(%r1)
+; CHECK-NEXT: lmg %r11, %r15, 248(%r11)
+; CHECK-NEXT: br %r14
+  %a = alloca i32, i32 %n
+  %b = getelementptr inbounds i32, i32* %a, i64 1198
+  store volatile i32 1, i32* %b
+  %c = load volatile i32, i32* %a
+  ret i32 %c
+}
+
+attributes #0 = {"probe-stack"="inline-asm"}
diff --git
a/llvm/test/CodeGen/SystemZ/stack-clash-protection.ll b/llvm/test/CodeGen/SystemZ/stack-clash-protection.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/stack-clash-protection.ll @@ -0,0 +1,242 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 -O3 | FileCheck %s +; +; Test stack clash protection probing for static allocas. + +; Small: the frame is allocated in a single step and needs one probe. +define i32 @fun0() #0 { +; CHECK-LABEL: fun0: +; CHECK: # %bb.0: +; CHECK-NEXT: aghi %r15, -560 +; CHECK-NEXT: .cfi_def_cfa_offset 720 +; CHECK-NEXT: cg %r0, 552(%r15) +; CHECK-NEXT: mvhi 552(%r15), 1 +; CHECK-NEXT: l %r2, 160(%r15) +; CHECK-NEXT: aghi %r15, 560 +; CHECK-NEXT: br %r14 + + %a = alloca i32, i64 100 + %b = getelementptr inbounds i32, i32* %a, i64 98 + store volatile i32 1, i32* %b + %c = load volatile i32, i32* %a + ret i32 %c +} + +; Medium: the frame is allocated in two steps, each followed by a probe. +define i32 @fun1() #0 { +; CHECK-LABEL: fun1: +; CHECK: # %bb.0: +; CHECK-NEXT: aghi %r15, -4096 +; CHECK-NEXT: .cfi_def_cfa_offset 4256 +; CHECK-NEXT: cg %r0, 4088(%r15) +; CHECK-NEXT: aghi %r15, -4080 +; CHECK-NEXT: .cfi_def_cfa_offset 8336 +; CHECK-NEXT: cg %r0, 4072(%r15) +; CHECK-NEXT: mvhi 976(%r15), 1 +; CHECK-NEXT: l %r2, 176(%r15) +; CHECK-NEXT: aghi %r15, 8176 +; CHECK-NEXT: br %r14 + + %a = alloca i32, i64 2000 + %b = getelementptr inbounds i32, i32* %a, i64 200 + store volatile i32 1, i32* %b + %c = load volatile i32, i32* %a + ret i32 %c +} + +; Large: use a loop to allocate and probe in steps, then allocate and probe the remainder. 
+define i32 @fun2() #0 { +; CHECK-LABEL: fun2: +; CHECK: # %bb.0: +; CHECK-NEXT: lgr %r1, %r15 +; CHECK-NEXT: .cfi_def_cfa_register %r1 +; CHECK-NEXT: agfi %r1, -69632 +; CHECK-NEXT: .cfi_def_cfa_offset 69792 +; CHECK-NEXT: .LBB2_1: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: aghi %r15, -4096 +; CHECK-NEXT: cg %r0, 4088(%r15) +; CHECK-NEXT: clgrjh %r15, %r1, .LBB2_1 +; CHECK-NEXT: # %bb.2: +; CHECK-NEXT: .cfi_def_cfa_register %r15 +; CHECK-NEXT: aghi %r15, -2544 +; CHECK-NEXT: .cfi_def_cfa_offset 72336 +; CHECK-NEXT: cg %r0, 2536(%r15) +; CHECK-NEXT: lhi %r0, 1 +; CHECK-NEXT: mvhi 568(%r15), 1 +; CHECK-NEXT: sty %r0, 28968(%r15) +; CHECK-NEXT: l %r2, 176(%r15) +; CHECK-NEXT: agfi %r15, 72176 +; CHECK-NEXT: br %r14 + + %a = alloca i32, i64 18000 + %b0 = getelementptr inbounds i32, i32* %a, i64 98 + %b1 = getelementptr inbounds i32, i32* %a, i64 7198 + store volatile i32 1, i32* %b0 + store volatile i32 1, i32* %b1 + %c = load volatile i32, i32* %a + ret i32 %c +} + +; The allocation is an exact multiple of the probe step, so no remainder probe is needed. +define void @fun3() #0 { +; CHECK-LABEL: fun3: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgr %r1, %r15 +; CHECK-NEXT: .cfi_def_cfa_register %r1 +; CHECK-NEXT: aghi %r1, -28672 +; CHECK-NEXT: .cfi_def_cfa_offset 28832 +; CHECK-NEXT: .LBB3_1: # %entry +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: aghi %r15, -4096 +; CHECK-NEXT: cg %r0, 4088(%r15) +; CHECK-NEXT: clgrjh %r15, %r1, .LBB3_1 +; CHECK-NEXT: # %bb.2: # %entry +; CHECK-NEXT: .cfi_def_cfa_register %r15 +; CHECK-NEXT: mvhi 180(%r15), 0 +; CHECK-NEXT: l %r0, 180(%r15) +; CHECK-NEXT: aghi %r15, 28672 +; CHECK-NEXT: br %r14 +entry: + %stack = alloca [7122 x i32], align 4 + %i = alloca i32, align 4 + %0 = bitcast [7122 x i32]* %stack to i8* + %i.0.i.0..sroa_cast = bitcast i32* %i to i8* + store volatile i32 0, i32* %i, align 4 + %i.0.i.0.6 = load volatile i32, i32* %i, align 4 + ret void +} + +; Loop with a bigger step, requested through the stack-probe-size attribute. 
+define void @fun4() #0 "stack-probe-size"="8192" { +; CHECK-LABEL: fun4: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgr %r1, %r15 +; CHECK-NEXT: .cfi_def_cfa_register %r1 +; CHECK-NEXT: aghi %r1, -24576 +; CHECK-NEXT: .cfi_def_cfa_offset 24736 +; CHECK-NEXT: .LBB4_1: # %entry +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: aghi %r15, -8192 +; CHECK-NEXT: cg %r0, 8184(%r15) +; CHECK-NEXT: clgrjh %r15, %r1, .LBB4_1 +; CHECK-NEXT: # %bb.2: # %entry +; CHECK-NEXT: .cfi_def_cfa_register %r15 +; CHECK-NEXT: aghi %r15, -7608 +; CHECK-NEXT: .cfi_def_cfa_offset 32344 +; CHECK-NEXT: cg %r0, 7600(%r15) +; CHECK-NEXT: mvhi 180(%r15), 0 +; CHECK-NEXT: l %r0, 180(%r15) +; CHECK-NEXT: aghi %r15, 32184 +; CHECK-NEXT: br %r14 +entry: + %stack = alloca [8000 x i32], align 4 + %i = alloca i32, align 4 + %0 = bitcast [8000 x i32]* %stack to i8* + %i.0.i.0..sroa_cast = bitcast i32* %i to i8* + store volatile i32 0, i32* %i, align 4 + %i.0.i.0.6 = load volatile i32, i32* %i, align 4 + ret void +} + +; The probe size is rounded down to a multiple of the stack alignment (4100 becomes 4096). +define void @fun5() #0 "stack-probe-size"="4100" { +; CHECK-LABEL: fun5: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: aghi %r15, -4096 +; CHECK-NEXT: .cfi_def_cfa_offset 4256 +; CHECK-NEXT: cg %r0, 4088(%r15) +; CHECK-NEXT: aghi %r15, -88 +; CHECK-NEXT: .cfi_def_cfa_offset 4344 +; CHECK-NEXT: cg %r0, 80(%r15) +; CHECK-NEXT: mvhi 180(%r15), 0 +; CHECK-NEXT: l %r0, 180(%r15) +; CHECK-NEXT: aghi %r15, 4184 +; CHECK-NEXT: br %r14 +entry: + %stack = alloca [1000 x i32], align 4 + %i = alloca i32, align 4 + %0 = bitcast [1000 x i32]* %stack to i8* + %i.0.i.0..sroa_cast = bitcast i32* %i to i8* + store volatile i32 0, i32* %i, align 4 + %i.0.i.0.6 = load volatile i32, i32* %i, align 4 + ret void +} + +; The minimum probe size is the stack alignment: a requested size of 5 is raised to 8. 
+define void @fun6() #0 "stack-probe-size"="5" { +; CHECK-LABEL: fun6: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgr %r1, %r15 +; CHECK-NEXT: .cfi_def_cfa_register %r1 +; CHECK-NEXT: aghi %r1, -4184 +; CHECK-NEXT: .cfi_def_cfa_offset 4344 +; CHECK-NEXT: .LBB6_1: # %entry +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: aghi %r15, -8 +; CHECK-NEXT: cg %r0, 0(%r15) +; CHECK-NEXT: clgrjh %r15, %r1, .LBB6_1 +; CHECK-NEXT: # %bb.2: # %entry +; CHECK-NEXT: .cfi_def_cfa_register %r15 +; CHECK-NEXT: mvhi 180(%r15), 0 +; CHECK-NEXT: l %r0, 180(%r15) +; CHECK-NEXT: aghi %r15, 4184 +; CHECK-NEXT: br %r14 +entry: + %stack = alloca [1000 x i32], align 4 + %i = alloca i32, align 4 + %0 = bitcast [1000 x i32]* %stack to i8* + %i.0.i.0..sroa_cast = bitcast i32* %i to i8* + store volatile i32 0, i32* %i, align 4 + %i.0.i.0.6 = load volatile i32, i32* %i, align 4 + ret void +} + +; Small with a natural probe (the STMG store in the prologue) - needs no extra probe. +define i32 @fun7() #0 { +; CHECK-LABEL: fun7: +; CHECK: # %bb.0: +; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -3976 +; CHECK-NEXT: .cfi_def_cfa_offset 4136 +; CHECK-NEXT: brasl %r14, foo@PLT +; CHECK-NEXT: st %r2, 568(%r15) +; CHECK-NEXT: l %r2, 176(%r15) +; CHECK-NEXT: lmg %r14, %r15, 4088(%r15) +; CHECK-NEXT: br %r14 + %v = call i32 @foo() + %a = alloca i32, i64 950 + %b = getelementptr inbounds i32, i32* %a, i64 98 + store volatile i32 %v, i32* %b + %c = load volatile i32, i32* %a + ret i32 %c +} + +; Medium with an STMG - the larger allocation still needs an explicit probe. 
+define i32 @fun8() #0 { +; CHECK-LABEL: fun8: +; CHECK: # %bb.0: +; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -3984 +; CHECK-NEXT: .cfi_def_cfa_offset 4144 +; CHECK-NEXT: cg %r0, 3976(%r15) +; CHECK-NEXT: brasl %r14, foo@PLT +; CHECK-NEXT: st %r2, 976(%r15) +; CHECK-NEXT: l %r2, 176(%r15) +; CHECK-NEXT: lmg %r14, %r15, 4096(%r15) +; CHECK-NEXT: br %r14 + + %v = call i32 @foo() + %a = alloca i32, i64 952 + %b = getelementptr inbounds i32, i32* %a, i64 200 + store volatile i32 %v, i32* %b + %c = load volatile i32, i32* %a + ret i32 %c +} + +declare i32 @foo() +attributes #0 = { "probe-stack"="inline-asm" } +