diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -70,7 +70,9 @@ New Compiler Flags ------------------ -- ... +- -fstack-clash-protection will provide protection against the stack clash + attack for the x86 architecture through automatic probing of each page of + allocated stack. Deprecated Compiler Flags ------------------------- diff --git a/clang/include/clang/Basic/CodeGenOptions.def b/clang/include/clang/Basic/CodeGenOptions.def --- a/clang/include/clang/Basic/CodeGenOptions.def +++ b/clang/include/clang/Basic/CodeGenOptions.def @@ -135,6 +135,7 @@ ///< enabled. CODEGENOPT(NoWarn , 1, 0) ///< Set when -Wa,--no-warn is enabled. CODEGENOPT(EnableSegmentedStacks , 1, 0) ///< Set when -fsplit-stack is enabled. +CODEGENOPT(StackClashProtector, 1, 0) ///< Set when -fstack-clash-protection is enabled. CODEGENOPT(NoImplicitFloat , 1, 0) ///< Set when -mno-implicit-float is enabled. CODEGENOPT(NoInfsFPMath , 1, 0) ///< Assume FP arguments, results not +-Inf. CODEGENOPT(NoSignedZeros , 1, 0) ///< Allow ignoring the signedness of FP zero diff --git a/clang/include/clang/Basic/DiagnosticFrontendKinds.td b/clang/include/clang/Basic/DiagnosticFrontendKinds.td --- a/clang/include/clang/Basic/DiagnosticFrontendKinds.td +++ b/clang/include/clang/Basic/DiagnosticFrontendKinds.td @@ -17,6 +17,9 @@ def err_fe_inline_asm : Error<"%0">, CatInlineAsm; def warn_fe_inline_asm : Warning<"%0">, CatInlineAsm, InGroup<BackendInlineAsm>; +def warn_fe_stack_clash_protection_inline_asm : Warning< + "Unable to protect inline asm that clobbers stack pointer against stack clash">, + CatInlineAsm, InGroup<BackendInlineAsm>; def note_fe_inline_asm : Note<"%0">, CatInlineAsm; def note_fe_inline_asm_here : Note<"instantiated into assembly here">; def err_fe_cannot_link_module : Error<"cannot link module '%0': %1">, diff --git a/clang/include/clang/Basic/TargetInfo.h b/clang/include/clang/Basic/TargetInfo.h --- a/clang/include/clang/Basic/TargetInfo.h +++ b/clang/include/clang/Basic/TargetInfo.h @@ -815,6 +815,8 @@ StringRef getNormalizedGCCRegisterName(StringRef Name, bool ReturnCanonical = false) const; + virtual const char *getSPRegName() const { return nullptr; } + /// Extracts a register from the passed constraint (if it is a /// single-register constraint) and the asm label expression related to a /// variable in the input or output list of an inline asm statement.
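For illustration only, here is a minimal self-contained C sketch of the user-facing behaviour described by the release note and the new diagnostic above. The file name, function names and buffer size are invented for the example, and an x86-64 Linux target with the default 4096-byte probe interval is assumed; the exact prologue sequence is target- and optimization-dependent.

/* stack-clash-demo.c: sketch of -fstack-clash-protection behaviour,
   assuming x86-64 Linux and the default 4096-byte probe interval.
   Build with: clang -O0 -fstack-clash-protection stack-clash-demo.c */
char sink(const char *p) { return p[0]; }

char big_frame(void) {
  /* A frame spanning several pages: with -fstack-clash-protection the
     prologue is expected to touch each page instead of skipping over the
     guard page with one large stack-pointer subtraction. */
  char buf[32 * 1024];
  buf[0] = 1;
  return sink(buf);
}

int clobbers_sp(int c) {
  int r;
  /* Inline asm that clobbers the stack pointer cannot be protected; this is
     where the new warn_fe_stack_clash_protection_inline_asm warning fires. */
  __asm__("sub %0, %%rsp" : : "rm"(c) : "rsp");
  __asm__("mov %%rsp, %0" : "=rm"(r)::);
  return r;
}

int main(void) { return big_frame() == 1 ? 0 : 1; }

Compiling the sketch with -fstack-clash-protection is expected to probe each page of big_frame's frame and to emit the warning for clobbers_sp; without the flag, neither happens.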
diff --git a/clang/include/clang/Driver/CC1Options.td b/clang/include/clang/Driver/CC1Options.td --- a/clang/include/clang/Driver/CC1Options.td +++ b/clang/include/clang/Driver/CC1Options.td @@ -736,6 +736,8 @@ HelpText<"Enable stack protectors">; def stack_protector_buffer_size : Separate<["-"], "stack-protector-buffer-size">, HelpText<"Lower bound for a buffer to be considered for stack protection">; +def stack_clash_protection : Flag<["-"], "stack-clash-protection">, + HelpText<"Enable stack clash protection">; def fvisibility : Separate<["-"], "fvisibility">, HelpText<"Default type and symbol visibility">; def ftype_visibility : Separate<["-"], "ftype-visibility">, diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -1706,6 +1706,8 @@ def fsplit_stack : Flag<["-"], "fsplit-stack">, Group<f_Group>; def fstack_protector_all : Flag<["-"], "fstack-protector-all">, Group<f_Group>, HelpText<"Enable stack protectors for all functions">; +def fstack_clash_protection : Flag<["-"], "fstack-clash-protection">, Group<f_Group>, + HelpText<"Enable stack clash protection">; def fstack_protector_strong : Flag<["-"], "fstack-protector-strong">, Group<f_Group>, HelpText<"Enable stack protectors for some functions vulnerable to stack smashing. " "Compared to -fstack-protector, this uses a stronger heuristic " diff --git a/clang/lib/Basic/Targets/X86.h b/clang/lib/Basic/Targets/X86.h --- a/clang/lib/Basic/Targets/X86.h +++ b/clang/lib/Basic/Targets/X86.h @@ -149,6 +149,8 @@ ArrayRef<TargetInfo::AddlRegName> getGCCAddlRegNames() const override; + const char *getSPRegName() const override { return "rsp"; } + bool validateCpuSupports(StringRef Name) const override; bool validateCpuIs(StringRef Name) const override; diff --git a/clang/lib/CodeGen/CGStmt.cpp b/clang/lib/CodeGen/CGStmt.cpp --- a/clang/lib/CodeGen/CGStmt.cpp +++ b/clang/lib/CodeGen/CGStmt.cpp @@ -10,14 +10,16 @@ // //===----------------------------------------------------------------------===// -#include "CodeGenFunction.h" #include "CGDebugInfo.h" +#include "CodeGenFunction.h" #include "CodeGenModule.h" #include "TargetInfo.h" #include "clang/AST/StmtVisitor.h" #include "clang/Basic/Builtins.h" #include "clang/Basic/PrettyStackTrace.h" #include "clang/Basic/TargetInfo.h" +#include "clang/Driver/DriverDiagnostic.h" +#include "clang/Frontend/FrontendDiagnostic.h" #include "llvm/ADT/StringExtras.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/InlineAsm.h" @@ -2229,8 +2231,14 @@ if (Clobber == "memory") ReadOnly = ReadNone = false; - else if (Clobber != "cc") + else if (Clobber != "cc") { Clobber = getTarget().getNormalizedGCCRegisterName(Clobber); + if (CGM.getCodeGenOpts().StackClashProtector && + Clobber == getTarget().getSPRegName()) { + CGM.getDiags().Report(S.getAsmLoc(), + diag::warn_fe_stack_clash_protection_inline_asm); + } + } if (!Constraints.empty()) Constraints += ','; diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -1486,6 +1486,9 @@ if (CodeGenOpts.UnwindTables) B.addAttribute(llvm::Attribute::UWTable); + if (CodeGenOpts.StackClashProtector) + B.addAttribute("probe-stack", "inline-asm"); + if (!hasUnwindExceptions(LangOpts)) B.addAttribute(llvm::Attribute::NoUnwind); diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp --- a/clang/lib/Driver/ToolChains/Clang.cpp +++
b/clang/lib/Driver/ToolChains/Clang.cpp @@ -2603,6 +2603,29 @@ } } +static void RenderSCPOptions(const ToolChain &TC, const ArgList &Args, + ArgStringList &CmdArgs) { + const llvm::Triple &EffectiveTriple = TC.getEffectiveTriple(); + + for (const Arg *A : Args) { + switch (A->getOption().getID()) { + default: + continue; + case options::OPT_fstack_clash_protection: { + switch (EffectiveTriple.getArch()) { + default: + return; + case llvm::Triple::ArchType::x86: + case llvm::Triple::ArchType::x86_64: + break; + } + A->claim(); + CmdArgs.push_back("-stack-clash-protection"); + } + } + } +} + static void RenderTrivialAutoVarInitOptions(const Driver &D, const ToolChain &TC, const ArgList &Args, @@ -4722,6 +4745,7 @@ CmdArgs.push_back(Args.MakeArgString("-mspeculative-load-hardening")); RenderSSPOptions(TC, Args, CmdArgs, KernelOrKext); + RenderSCPOptions(TC, Args, CmdArgs); RenderTrivialAutoVarInitOptions(D, TC, Args, CmdArgs); // Translate -mstackrealign diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp --- a/clang/lib/Frontend/CompilerInvocation.cpp +++ b/clang/lib/Frontend/CompilerInvocation.cpp @@ -1213,6 +1213,8 @@ Opts.NoStackArgProbe = Args.hasArg(OPT_mno_stack_arg_probe); + Opts.StackClashProtector = Args.hasArg(OPT_stack_clash_protection); + if (Arg *A = Args.getLastArg(OPT_fobjc_dispatch_method_EQ)) { StringRef Name = A->getValue(); unsigned Method = llvm::StringSwitch<unsigned>(Name) diff --git a/clang/test/CodeGen/stack-clash-protection.c b/clang/test/CodeGen/stack-clash-protection.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/stack-clash-protection.c @@ -0,0 +1,34 @@ +// check interaction between -fstack-clash-protection and dynamic allocation schemes +// RUN: %clang -target x86_64 -O0 -o %t.out %s -fstack-clash-protection && %t.out + +#include <alloca.h> +#include <string.h> + +int large_stack() __attribute__((noinline)); + +int large_stack() { + int stack[20000], i; + for (i = 0; i < sizeof(stack) / sizeof(int); ++i) + stack[i] = i; + return stack[1]; +} + +int main(int argc, char **argv) { + int volatile static_mem[8000]; + for (size_t i = 0; i < argc * sizeof(static_mem) / sizeof(static_mem[0]); ++i) + static_mem[i] = argc * i; + + int vla[argc]; + memset(&vla[0], 0, argc); + + int index = large_stack(); + + // also check allocation of 0 size + volatile void *mem = __builtin_alloca(argc - 1); + + int volatile *dyn_mem = alloca(sizeof(static_mem) * argc); + for (size_t i = 0; i < argc * sizeof(static_mem) / sizeof(static_mem[0]); ++i) + dyn_mem[i] = argc * i; + + return static_mem[(7999 * argc) / 2] - dyn_mem[(7999 * argc) / 2] + vla[argc - index]; +} diff --git a/clang/test/Driver/stack-clash-protection.c b/clang/test/Driver/stack-clash-protection.c new file mode 100644 --- /dev/null +++ b/clang/test/Driver/stack-clash-protection.c @@ -0,0 +1,22 @@ +// RUN: %clang -target i386-unknown-linux -fstack-clash-protection -### %s 2>&1 | FileCheck %s -check-prefix=SCP-i386 +// SCP-i386: "-stack-clash-protection" + +// RUN: %clang -target x86_64-scei-ps4 -fstack-clash-protection -### %s 2>&1 | FileCheck %s -check-prefix=SCP-x86 +// SCP-x86: "-stack-clash-protection" + +// RUN: %clang -target armv7k-apple-watchos2.0 -fstack-clash-protection -### %s 2>&1 | FileCheck %s -check-prefix=SCP-armv7 +// SCP-armv7-NOT: "-stack-clash-protection" + +// RUN: %clang -target x86_64-unknown-linux -fstack-clash-protection -c %s 2>&1 | FileCheck %s -check-prefix=SCP-warn +// SCP-warn: warning: Unable to protect inline asm that clobbers stack pointer against stack clash +
+int foo(int c) { + int r; + __asm__("sub %0, %%rsp" + : + : "rm"(c) + : "rsp"); + __asm__("mov %%rsp, %0" + : "=rm"(r)::); + return r; +} diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst --- a/llvm/docs/ReleaseNotes.rst +++ b/llvm/docs/ReleaseNotes.rst @@ -101,6 +101,10 @@ decrease in CPU frequency on these CPUs. This can be re-enabled by passing -mprefer-vector-width=512 to clang or passing -mattr=-prefer-256-bit to llc. +* Functions with the probe-stack attribute set to "inline-asm" are now protected + against stack clash without the need of a third-party probing function and + with limited impact on performance. + Changes to the AMDGPU Target ----------------------------- diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -1659,6 +1659,10 @@ /// Returns the name of the symbol used to emit stack probes or the empty /// string if not applicable. + virtual bool hasStackProbeSymbol(MachineFunction &MF) const { return false; } + + virtual bool hasInlineStackProbe(MachineFunction &MF) const { return false; } + virtual StringRef getStackProbeSymbolName(MachineFunction &MF) const { return ""; } diff --git a/llvm/lib/Target/X86/X86CallFrameOptimization.cpp b/llvm/lib/Target/X86/X86CallFrameOptimization.cpp --- a/llvm/lib/Target/X86/X86CallFrameOptimization.cpp +++ b/llvm/lib/Target/X86/X86CallFrameOptimization.cpp @@ -162,14 +162,13 @@ // memory for arguments. unsigned FrameSetupOpcode = TII->getCallFrameSetupOpcode(); unsigned FrameDestroyOpcode = TII->getCallFrameDestroyOpcode(); - bool UseStackProbe = - !STI->getTargetLowering()->getStackProbeSymbolName(MF).empty(); + bool EmitStackProbeCall = STI->getTargetLowering()->hasStackProbeSymbol(MF); unsigned StackProbeSize = STI->getTargetLowering()->getStackProbeSize(MF); for (MachineBasicBlock &BB : MF) { bool InsideFrameSequence = false; for (MachineInstr &MI : BB) { if (MI.getOpcode() == FrameSetupOpcode) { - if (TII->getFrameSize(MI) >= StackProbeSize && UseStackProbe) + if (TII->getFrameSize(MI) >= StackProbeSize && EmitStackProbeCall) return false; if (InsideFrameSequence) return false; diff --git a/llvm/lib/Target/X86/X86FrameLowering.h b/llvm/lib/Target/X86/X86FrameLowering.h --- a/llvm/lib/Target/X86/X86FrameLowering.h +++ b/llvm/lib/Target/X86/X86FrameLowering.h @@ -128,6 +128,25 @@ void emitSPUpdate(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, const DebugLoc &DL, int64_t NumBytes, bool InEpilogue) const; + class FreeStackProbe { + MachineBasicBlock::iterator MBBI; + int64_t Offset = -1; + + public: + FreeStackProbe() = default; + FreeStackProbe(MachineBasicBlock::iterator MBBI, int64_t Offset) + : MBBI{MBBI}, Offset{Offset} {}; + explicit operator bool() const { return Offset != -1; } + void shiftBy(int64_t ShiftValue); + int64_t getOffset() const { return Offset; } + MachineBasicBlock::iterator getIterator() const { return MBBI; } + }; + + FreeStackProbe findFreeStackProbe(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + int64_t StackSize, int64_t MinOffset, + int64_t MaxOffset) const; + /// Check that LEA can be used on SP in an epilogue sequence for \p MF. 
bool canUseLEAForSPInEpilogue(const MachineFunction &MF) const; @@ -189,11 +208,33 @@ void emitStackProbeInline(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, bool InProlog) const; + void emitStackProbeInlineWindowsCoreCLR64(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + const DebugLoc &DL, + bool InProlog) const; + void emitStackProbeInlineGeneric(MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + const DebugLoc &DL, bool InProlog) const; + + void emitStackProbeInlineGenericBlock(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + const DebugLoc &DL, + uint64_t Offset) const; + + void emitStackProbeInlineGenericLoop(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + const DebugLoc &DL, + uint64_t Offset) const; /// Emit a stub to later inline the target stack probe. - void emitStackProbeInlineStub(MachineFunction &MF, MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - const DebugLoc &DL, bool InProlog) const; + MachineInstr *emitStackProbeInlineStub(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + const DebugLoc &DL, + bool InProlog) const; /// Aligns the stack pointer by ANDing it with -MaxAlign. void BuildStackAlignAND(MachineBasicBlock &MBB, diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp --- a/llvm/lib/Target/X86/X86FrameLowering.cpp +++ b/llvm/lib/Target/X86/X86FrameLowering.cpp @@ -17,6 +17,7 @@ #include "X86Subtarget.h" #include "X86TargetMachine.h" #include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/Statistic.h" #include "llvm/Analysis/EHPersonalities.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" @@ -32,6 +33,13 @@ #include "llvm/Target/TargetOptions.h" #include <cstdlib> +#define DEBUG_TYPE "x86-fl" + +STATISTIC(NumFrameLoopProbe, "Number of loop stack probes used in prologue"); +STATISTIC(NumFrameFreeProbe, "Number of free stack probes used in prologue"); +STATISTIC(NumFrameExtraProbe, + "Number of extra stack probes generated in prologue"); + using namespace llvm; X86FrameLowering::X86FrameLowering(const X86Subtarget &STI, @@ -244,6 +252,122 @@ return false; } +/// Returns true when \p Opcode is a plain, unconditional move to memory (a +/// store of a register or an immediate) that can double as a stack probe.
+static bool isUnconditionalMove(unsigned Opcode) { + switch (Opcode) { + case X86::MOV8mi: + case X86::MOV16mi: + case X86::MOV32mi: + case X86::MOV64mi32: + case X86::MOV8mr: + case X86::MOV16mr: + case X86::MOV32mr: + case X86::MOV64mr: + return true; + default: + return false; + } +}; + +static int64_t getStackOffset(X86FrameLowering const &FL, + const MachineInstr &MemOp) { + const MCInstrDesc &Desc = MemOp.getDesc(); + int MemRefBegin = X86II::getMemoryOperandNo(Desc.TSFlags); + if (MemRefBegin < 0) + return -1; + + MemRefBegin += X86II::getOperandBias(Desc); + + const MachineOperand &BaseOp = + MemOp.getOperand(MemRefBegin + X86::AddrBaseReg); + if (!BaseOp.isFI()) + return -1; + + if (MemOp.getOperand(MemRefBegin + X86::AddrScaleAmt).getImm() != 1) + return -1; + + if (MemOp.getOperand(MemRefBegin + X86::AddrIndexReg).getReg() != + X86::NoRegister) + return -1; + + const MachineOperand &DispMO = MemOp.getOperand(MemRefBegin + X86::AddrDisp); + + // Displacement can be symbolic + if (!DispMO.isImm()) + return -1; + + int FrameIndex = BaseOp.getIndex(); + + MachineFunction const &MF = *MemOp.getParent()->getParent(); + unsigned FR; + return FL.getFrameIndexReferencePreferSP(MF, FrameIndex, FR, true) + + DispMO.getImm(); +} + +void X86FrameLowering::FreeStackProbe::shiftBy(int64_t ShiftValue) { + MachineInstr &FreeProbe = *MBBI; + if (FreeProbe.isCall()) + return; + + MachineInstr &MemOp = FreeProbe; + + const MCInstrDesc &Desc = MemOp.getDesc(); + int MemRefBegin = X86II::getMemoryOperandNo(Desc.TSFlags); + assert(MemRefBegin >= 0 && "no memory operand"); + + MemRefBegin += X86II::getOperandBias(Desc); + + const MachineOperand &BaseOp = + MemOp.getOperand(MemRefBegin + X86::AddrBaseReg); + assert(BaseOp.isFI() && "frame object"); + MachineOperand &DispMO = MemOp.getOperand(MemRefBegin + X86::AddrDisp); + assert(DispMO.isImm() && "displacement is symbolic"); + + DispMO.setImm(DispMO.getImm() - ShiftValue); + FreeProbe.setFlag(MachineInstr::FrameSetup); +} + +/// findFreeStackProbe - traverse function from @p MBBI to the end of @p MBB, +/// stopping when one of the following happens first: +/// +/// - a MachineInstruction writing to the stack at an offset statically known +/// to be in [MinOffset, MaxOffset) is found, and return an iterator to that +/// instruction and the offset of the Probe, (ProbeIterator, ProbeOffset) +/// - a MachineInstruction reading or writing to the stack at an offset not +/// statically known is found, and return (MBB.end(), -1). +/// - none of the above happen, and return (MBB.end(), -1). +X86FrameLowering::FreeStackProbe X86FrameLowering::findFreeStackProbe( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, int64_t StackSize, + int64_t MinOffset, int64_t MaxOffset) const { + + for (; MBBI != MBB.end(); ++MBBI) { + MachineInstr &MI = *MBBI; + if (MI.isCall()) { + return {MBBI, MaxOffset}; + } + + if (isUnconditionalMove(MI.getOpcode())) { + int64_t StackOffset = StackSize - getStackOffset(*this, MI); + if (StackOffset >= 0) { + if (MinOffset <= StackOffset && StackOffset < MaxOffset) { + return {MBBI, StackOffset}; + } else { + // writing too far from MaxOffset + return {}; + } + } + // don't know where we write, stopping + break; + } + if (std::any_of(MI.operands_begin(), MI.operands_end(), + [](MachineOperand &MO) { return MO.isFI(); })) { + break; // effect on stack pointer not modelled, stopping + } + } + return {}; +} + /// emitSPUpdate - Emit a series of instructions to increment / decrement the /// stack pointer by a constant value.
void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB, @@ -257,7 +381,27 @@ uint64_t Chunk = (1LL << 31) - 1; - if (Offset > Chunk) { + MachineFunction &MF = *MBB.getParent(); + const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>(); + const X86TargetLowering &TLI = *STI.getTargetLowering(); + const bool EmitInlineStackProbe = TLI.hasInlineStackProbe(MF); + + // It's ok to not take into account large chunks when probing, as the + // allocation is split into smaller chunks anyway. + if (EmitInlineStackProbe && !InEpilogue) { + + // Stack probing may involve looping, and control flow generation is + // disallowed at this point. Rely on later processing through + // `inlineStackProbe`. + MachineInstr *Stub = emitStackProbeInlineStub(MF, MBB, MBBI, DL, true); + + // Encode the static offset as a metadata attached to the stub. + LLVMContext &Context = MF.getFunction().getContext(); + MachineInstrBuilder(MF, Stub).addMetadata( + MDTuple::get(Context, {ConstantAsMetadata::get(ConstantInt::get( + IntegerType::get(Context, 64), Offset))})); + return; + } else if (Offset > Chunk) { // Rather than emit a long series of instructions for large offsets, // load the offset into a register and do one sub/add unsigned Reg = 0; @@ -381,8 +525,8 @@ } else { bool IsSub = Offset < 0; uint64_t AbsOffset = IsSub ? -Offset : Offset; - unsigned Opc = IsSub ? getSUBriOpcode(Uses64BitFramePtr, AbsOffset) - : getADDriOpcode(Uses64BitFramePtr, AbsOffset); + const unsigned Opc = IsSub ? getSUBriOpcode(Uses64BitFramePtr, AbsOffset) + : getADDriOpcode(Uses64BitFramePtr, AbsOffset); MI = BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr) .addReg(StackPtr) .addImm(AbsOffset); @@ -528,6 +672,183 @@ const DebugLoc &DL, bool InProlog) const { const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>(); + if (STI.isTargetWindowsCoreCLR() && STI.is64Bit()) + emitStackProbeInlineWindowsCoreCLR64(MF, MBB, MBBI, DL, InProlog); + else + emitStackProbeInlineGeneric(MF, MBB, MBBI, DL, InProlog); +} + +void X86FrameLowering::emitStackProbeInlineGeneric( + MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, const DebugLoc &DL, bool InProlog) const { + MachineInstr &CallToInline = *std::prev(MBBI); + assert(CallToInline.getOperand(1).isMetadata() && + "no metadata attached to that probe"); + uint64_t Offset = + cast<ConstantInt>( + cast<ConstantAsMetadata>( + cast<MDTuple>(CallToInline.getOperand(1).getMetadata()) + ->getOperand(0)) + ->getValue()) + ->getZExtValue(); + + const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>(); + const X86TargetLowering &TLI = *STI.getTargetLowering(); + assert(!(STI.is64Bit() && STI.isTargetWindowsCoreCLR()) && + "different expansion expected for CoreCLR 64 bit"); + + const uint64_t StackProbeSize = TLI.getStackProbeSize(MF); + uint64_t ProbeChunk = StackProbeSize * 8; + + // Synthesize a loop or unroll it, depending on the number of iterations. + if (Offset > ProbeChunk) { + emitStackProbeInlineGenericLoop(MF, MBB, MBBI, DL, Offset); + } else { + emitStackProbeInlineGenericBlock(MF, MBB, MBBI, DL, Offset); + } +} + +void X86FrameLowering::emitStackProbeInlineGenericBlock( + MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, const DebugLoc &DL, + uint64_t Offset) const { + + const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>(); + const X86TargetLowering &TLI = *STI.getTargetLowering(); + const unsigned Opc = getSUBriOpcode(Uses64BitFramePtr, Offset); + const unsigned MovMIOpc = Is64Bit ?
X86::MOV64mi32 : X86::MOV32mi; + const uint64_t StackProbeSize = TLI.getStackProbeSize(MF); + uint64_t CurrentOffset = 0; + // 0 Thanks to return address being saved on the stack + uint64_t CurrentProbeOffset = 0; + + while (CurrentOffset < Offset) { + uint64_t ChunkSize = std::min(Offset - CurrentOffset, StackProbeSize); + MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr) + .addReg(StackPtr) + .addImm(ChunkSize) + .setMIFlag(MachineInstr::FrameSetup); + CurrentOffset += ChunkSize; + MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead. + + // We need to touch the stack allocated by MI. Instead of generating + // a probe, we try to reuse existing stack operation available in the + // EntryBlock, starting from MI. findFreeStackProbe gives us such a + // probe, if any and continuing allocation from that probe doesn't + // conflict with instructions between MI and the free stack probe. + + const int64_t FreeProbeLowerBound = CurrentOffset - ChunkSize; + const int64_t FreeProbeUpperBound = CurrentOffset; + while (FreeStackProbe FreeProbe = findFreeStackProbe( + MBB, MBBI, Offset, FreeProbeLowerBound, FreeProbeUpperBound)) { + NumFrameFreeProbe++; + CurrentProbeOffset = FreeProbe.getOffset(); + FreeProbe.shiftBy(Offset - FreeProbeUpperBound); + MBBI = std::next(FreeProbe.getIterator()); + + // We found the best probe we could find, stop here in case + // another perfect probe shows up: we could use it for another page. + if (FreeProbe.getOffset() == FreeProbeUpperBound) + break; + } + + while (CurrentOffset - CurrentProbeOffset >= StackProbeSize) { + NumFrameExtraProbe++; + auto Shift = std::min(CurrentOffset - CurrentProbeOffset, StackProbeSize); + addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(MovMIOpc)) + .setMIFlag(MachineInstr::FrameSetup), + StackPtr, false, StackProbeSize - Shift) + .addImm(0) + .setMIFlag(MachineInstr::FrameSetup); + CurrentProbeOffset += Shift; + } + } +} + +void X86FrameLowering::emitStackProbeInlineGenericLoop( + MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, const DebugLoc &DL, + uint64_t Offset) const { + + const X86Subtarget &STI = MF.getSubtarget(); + const X86TargetLowering &TLI = *STI.getTargetLowering(); + const unsigned MovMIOpc = Is64Bit ? X86::MOV64mi32 : X86::MOV32mi; + const uint64_t StackProbeSize = TLI.getStackProbeSize(MF); + + // Synthesize a loop + NumFrameLoopProbe++; + const BasicBlock *LLVM_BB = MBB.getBasicBlock(); + + MachineBasicBlock *testMBB = MF.CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *tailMBB = MF.CreateMachineBasicBlock(LLVM_BB); + + MachineFunction::iterator MBBIter = ++MBB.getIterator(); + MF.insert(MBBIter, testMBB); + MF.insert(MBBIter, tailMBB); + + unsigned FinalStackPtr = Uses64BitFramePtr ? 
X86::R11 : X86::R11D; + BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64rr), FinalStackPtr) + .addReg(StackPtr) + .setMIFlag(MachineInstr::FrameSetup); + + // save loop bound + { + const unsigned Opc = getSUBriOpcode(Uses64BitFramePtr, Offset); + BuildMI(MBB, MBBI, DL, TII.get(Opc), FinalStackPtr) + .addReg(FinalStackPtr) + .addImm(Offset / StackProbeSize * StackProbeSize) + .setMIFlag(MachineInstr::FrameSetup); + } + + // allocate a page + { + const unsigned Opc = getSUBriOpcode(Uses64BitFramePtr, StackProbeSize); + BuildMI(testMBB, DL, TII.get(Opc), StackPtr) + .addReg(StackPtr) + .addImm(StackProbeSize) + .setMIFlag(MachineInstr::FrameSetup); + } + + // touch the page + addRegOffset(BuildMI(testMBB, DL, TII.get(MovMIOpc)) + .setMIFlag(MachineInstr::FrameSetup), + StackPtr, false, 0) + .addImm(0) + .setMIFlag(MachineInstr::FrameSetup); + + // cmp with stack pointer bound + BuildMI(testMBB, DL, TII.get(IsLP64 ? X86::CMP64rr : X86::CMP32rr)) + .addReg(StackPtr) + .addReg(FinalStackPtr) + .setMIFlag(MachineInstr::FrameSetup); + + // jump + BuildMI(testMBB, DL, TII.get(X86::JCC_1)) + .addMBB(testMBB) + .addImm(X86::COND_NE) + .setMIFlag(MachineInstr::FrameSetup); + testMBB->addSuccessor(testMBB); + testMBB->addSuccessor(tailMBB); + + // allocate a block and touch it + + tailMBB->splice(tailMBB->end(), &MBB, MBBI, MBB.end()); + tailMBB->transferSuccessorsAndUpdatePHIs(&MBB); + MBB.addSuccessor(testMBB); + + if (Offset % StackProbeSize) { + const unsigned Opc = getSUBriOpcode(Uses64BitFramePtr, Offset); + BuildMI(*tailMBB, tailMBB->begin(), DL, TII.get(Opc), StackPtr) + .addReg(StackPtr) + .addImm(Offset % StackProbeSize) + .setMIFlag(MachineInstr::FrameSetup); + } +} + +void X86FrameLowering::emitStackProbeInlineWindowsCoreCLR64( + MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, const DebugLoc &DL, bool InProlog) const { + const X86Subtarget &STI = MF.getSubtarget(); assert(STI.is64Bit() && "different expansion needed for 32 bit"); assert(STI.isTargetWindowsCoreCLR() && "custom expansion expects CoreCLR"); const TargetInstrInfo &TII = *STI.getInstrInfo(); @@ -821,13 +1142,13 @@ } } -void X86FrameLowering::emitStackProbeInlineStub( +MachineInstr *X86FrameLowering::emitStackProbeInlineStub( MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, bool InProlog) const { assert(InProlog && "ChkStkStub called outside prolog!"); - BuildMI(MBB, MBBI, DL, TII.get(X86::CALLpcrel32)) + return BuildMI(MBB, MBBI, DL, TII.get(X86::CALLpcrel32)) .addExternalSymbol("__chkstk_stub"); } @@ -1015,7 +1336,8 @@ X86FI->setCalleeSavedFrameSize( X86FI->getCalleeSavedFrameSize() - TailCallReturnAddrDelta); - bool UseStackProbe = !STI.getTargetLowering()->getStackProbeSymbolName(MF).empty(); + const bool EmitStackProbeCall = + STI.getTargetLowering()->hasStackProbeSymbol(MF); unsigned StackProbeSize = STI.getTargetLowering()->getStackProbeSize(MF); // Re-align the stack on 64-bit if the x86-interrupt calling convention is @@ -1033,11 +1355,10 @@ // pointer, calls, or dynamic alloca then we do not need to adjust the // stack pointer (we fit in the Red Zone). We also check that we don't // push and pop from the stack. - if (has128ByteRedZone(MF) && - !TRI->needsStackRealignment(MF) && + if (has128ByteRedZone(MF) && !TRI->needsStackRealignment(MF) && !MFI.hasVarSizedObjects() && // No dynamic alloca. !MFI.adjustsStack() && // No calls. - !UseStackProbe && // No stack probes. + !EmitStackProbeCall && // No stack probes. 
!MFI.hasCopyImplyingStackAdjustment() && // Don't push and pop. !MF.shouldSplitStack()) { // Regular stack uint64_t MinSize = X86FI->getCalleeSavedFrameSize(); @@ -1238,7 +1559,7 @@ uint64_t AlignedNumBytes = NumBytes; if (IsWin64Prologue && !IsFunclet && TRI->needsStackRealignment(MF)) AlignedNumBytes = alignTo(AlignedNumBytes, MaxAlign); - if (AlignedNumBytes >= StackProbeSize && UseStackProbe) { + if (AlignedNumBytes >= StackProbeSize && EmitStackProbeCall) { assert(!X86FI->getUsesRedZone() && "The Red Zone is not accounted for in stack probes"); diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -537,6 +537,10 @@ // falls back to heap allocation if not. SEG_ALLOCA, + // For allocating stack space when using stack clash protector. + // Allocation is performed by block, and each block is probed. + PROBED_ALLOCA, + // Memory barriers. MEMBARRIER, MFENCE, @@ -1215,6 +1219,8 @@ bool supportSwiftError() const override; + bool hasStackProbeSymbol(MachineFunction &MF) const override; + bool hasInlineStackProbe(MachineFunction &MF) const override; StringRef getStackProbeSymbolName(MachineFunction &MF) const override; unsigned getStackProbeSize(MachineFunction &MF) const; @@ -1439,6 +1445,9 @@ MachineBasicBlock *EmitLoweredSegAlloca(MachineInstr &MI, MachineBasicBlock *BB) const; + MachineBasicBlock *EmitLoweredProbedAlloca(MachineInstr &MI, + MachineBasicBlock *BB) const; + MachineBasicBlock *EmitLoweredTLSAddr(MachineInstr &MI, MachineBasicBlock *BB) const; diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -22144,9 +22144,9 @@ SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); bool SplitStack = MF.shouldSplitStack(); - bool EmitStackProbe = !getStackProbeSymbolName(MF).empty(); + bool EmitStackProbeCall = hasStackProbeSymbol(MF); bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) || - SplitStack || EmitStackProbe; + SplitStack || EmitStackProbeCall; SDLoc dl(Op); // Get the inputs. 
@@ -22170,11 +22170,21 @@ assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and" " not tell us which reg is the stack pointer!"); - SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT); - Chain = SP.getValue(1); const TargetFrameLowering &TFI = *Subtarget.getFrameLowering(); unsigned StackAlign = TFI.getStackAlignment(); - Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value + if (hasInlineStackProbe(MF)) { + MachineRegisterInfo &MRI = MF.getRegInfo(); + + const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy); + Register Vreg = MRI.createVirtualRegister(AddrRegClass); + Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size); + Result = DAG.getNode(X86ISD::PROBED_ALLOCA, dl, SPTy, Chain, + DAG.getRegister(Vreg, SPTy)); + } else { + SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT); + Chain = SP.getValue(1); + Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value + } if (Align > StackAlign) Result = DAG.getNode(ISD::AND, dl, VT, Result, DAG.getConstant(-(uint64_t)Align, dl, VT)); @@ -28702,6 +28712,8 @@ case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER"; case X86ISD::MFENCE: return "X86ISD::MFENCE"; case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA"; + case X86ISD::PROBED_ALLOCA: + return "X86ISD::PROBED_ALLOCA"; case X86ISD::SAHF: return "X86ISD::SAHF"; case X86ISD::RDRAND: return "X86ISD::RDRAND"; case X86ISD::RDSEED: return "X86ISD::RDSEED"; @@ -29959,6 +29971,95 @@ } MachineBasicBlock * +X86TargetLowering::EmitLoweredProbedAlloca(MachineInstr &MI, + MachineBasicBlock *BB) const { + MachineFunction *MF = BB->getParent(); + const TargetInstrInfo *TII = Subtarget.getInstrInfo(); + DebugLoc DL = MI.getDebugLoc(); + const BasicBlock *LLVM_BB = BB->getBasicBlock(); + + const bool Is64Bit = Subtarget.is64Bit(); + const bool IsLP64 = Subtarget.isTarget64BitLP64(); + + const unsigned ProbeSize = getStackProbeSize(*MF); + + MachineRegisterInfo &MRI = MF->getRegInfo(); + MachineBasicBlock *testMBB = MF->CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *tailMBB = MF->CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *blockMBB = MF->CreateMachineBasicBlock(LLVM_BB); + + MachineFunction::iterator MBBIter = ++BB->getIterator(); + MF->insert(MBBIter, testMBB); + MF->insert(MBBIter, blockMBB); + MF->insert(MBBIter, tailMBB); + + unsigned sizeVReg = MI.getOperand(1).getReg(); + + const TargetRegisterClass *SizeRegClass = MRI.getRegClass(sizeVReg); + + unsigned tmpSizeVReg = MRI.createVirtualRegister(SizeRegClass); + unsigned tmpSizeVReg2 = MRI.createVirtualRegister(SizeRegClass); + + unsigned physSPReg = + IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP; + + // test rsp size + BuildMI(testMBB, DL, TII->get(X86::PHI), tmpSizeVReg) + .addReg(sizeVReg) + .addMBB(BB) + .addReg(tmpSizeVReg2) + .addMBB(blockMBB); + + BuildMI(testMBB, DL, TII->get(IsLP64 ? X86::CMP64ri32 : X86::CMP32ri)) + .addReg(tmpSizeVReg) + .addImm(ProbeSize); + + BuildMI(testMBB, DL, TII->get(X86::JCC_1)) + .addMBB(tailMBB) + .addImm(X86::COND_L); + testMBB->addSuccessor(blockMBB); + testMBB->addSuccessor(tailMBB); + + // allocate a block and touch it + + BuildMI(blockMBB, DL, TII->get(IsLP64 ? X86::SUB64ri32 : X86::SUB32ri), + tmpSizeVReg2) + .addReg(tmpSizeVReg) + .addImm(ProbeSize); + + BuildMI(blockMBB, DL, TII->get(IsLP64 ? X86::SUB64ri32 : X86::SUB32ri), + physSPReg) + .addReg(physSPReg) + .addImm(ProbeSize); + + const unsigned MovMIOpc = Is64Bit ? 
X86::MOV64mi32 : X86::MOV32mi; + addRegOffset(BuildMI(blockMBB, DL, TII->get(MovMIOpc)), physSPReg, false, 0) + .addImm(0); + + BuildMI(blockMBB, DL, TII->get(X86::JMP_1)).addMBB(testMBB); + blockMBB->addSuccessor(testMBB); + + // allocate the tail and continue + BuildMI(tailMBB, DL, TII->get(IsLP64 ? X86::SUB64rr : X86::SUB32rr), + physSPReg) + .addReg(physSPReg) + .addReg(tmpSizeVReg); + BuildMI(tailMBB, DL, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg()) + .addReg(physSPReg); + + tailMBB->splice(tailMBB->end(), BB, + std::next(MachineBasicBlock::iterator(MI)), BB->end()); + tailMBB->transferSuccessorsAndUpdatePHIs(BB); + BB->addSuccessor(testMBB); + + // Delete the original pseudo instruction. + MI.eraseFromParent(); + + // And we're done. + return tailMBB; +} + +MachineBasicBlock * X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI, MachineBasicBlock *BB) const { MachineFunction *MF = BB->getParent(); @@ -31133,6 +31234,9 @@ case X86::SEG_ALLOCA_32: case X86::SEG_ALLOCA_64: return EmitLoweredSegAlloca(MI, BB); + case X86::PROBED_ALLOCA_32: + case X86::PROBED_ALLOCA_64: + return EmitLoweredProbedAlloca(MI, BB); case X86::TLSCall_32: case X86::TLSCall_64: return EmitLoweredTLSCall(MI, BB); @@ -45822,10 +45926,35 @@ return Subtarget.is64Bit(); } +/// Returns true if stack probing through a function call is requested. +bool X86TargetLowering::hasStackProbeSymbol(MachineFunction &MF) const { + return !getStackProbeSymbolName(MF).empty(); +} + +/// Returns true if stack probing through inline assembly is requested. +bool X86TargetLowering::hasInlineStackProbe(MachineFunction &MF) const { + + // No inline stack probe for Windows, they have their own mechanism. + if (Subtarget.isOSWindows() || + MF.getFunction().hasFnAttribute("no-stack-arg-probe")) + return false; + + // If the function specifically requests inline stack probes, emit them. + if (MF.getFunction().hasFnAttribute("probe-stack")) + return MF.getFunction().getFnAttribute("probe-stack").getValueAsString() == + "inline-asm"; + + return false; +} + /// Returns the name of the symbol used to emit stack probes or the empty /// string if not applicable. StringRef X86TargetLowering::getStackProbeSymbolName(MachineFunction &MF) const { + // Inline Stack probes disable stack probe call + if (hasInlineStackProbe(MF)) + return ""; + // If the function specifically requests stack probes, emit them. if (MF.getFunction().hasFnAttribute("probe-stack")) return MF.getFunction().getFnAttribute("probe-stack").getValueAsString(); diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td --- a/llvm/lib/Target/X86/X86InstrCompiler.td +++ b/llvm/lib/Target/X86/X86InstrCompiler.td @@ -111,6 +111,23 @@ [(set GR64:$dst, (X86SegAlloca GR64:$size))]>, Requires<[In64BitMode]>; + +// To protect against stack clash, dynamic allocation should perform a memory +// probe at each page. 
+ +let Defs = [EAX, ESP, EFLAGS], Uses = [ESP] in +def PROBED_ALLOCA_32 : I<0, Pseudo, (outs GR32:$dst), (ins GR32:$size), + "# variable sized alloca with probing", + [(set GR32:$dst, + (X86ProbedAlloca GR32:$size))]>, + Requires<[NotLP64]>; + +let Defs = [RAX, RSP, EFLAGS], Uses = [RSP] in +def PROBED_ALLOCA_64 : I<0, Pseudo, (outs GR64:$dst), (ins GR64:$size), + "# variable sized alloca with probing", + [(set GR64:$dst, + (X86ProbedAlloca GR64:$size))]>, + Requires<[In64BitMode]>; } // Dynamic stack allocation yields a _chkstk or _alloca call for all Windows diff --git a/llvm/lib/Target/X86/X86InstrInfo.td b/llvm/lib/Target/X86/X86InstrInfo.td --- a/llvm/lib/Target/X86/X86InstrInfo.td +++ b/llvm/lib/Target/X86/X86InstrInfo.td @@ -121,6 +121,8 @@ def SDT_X86SEG_ALLOCA : SDTypeProfile<1, 1, [SDTCisVT<0, iPTR>, SDTCisVT<1, iPTR>]>; +def SDT_X86PROBED_ALLOCA : SDTypeProfile<1, 1, [SDTCisVT<0, iPTR>, SDTCisVT<1, iPTR>]>; + def SDT_X86EHRET : SDTypeProfile<0, 1, [SDTCisInt<0>]>; def SDT_X86TCRET : SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisVT<1, i32>]>; @@ -292,6 +294,9 @@ def X86SegAlloca : SDNode<"X86ISD::SEG_ALLOCA", SDT_X86SEG_ALLOCA, [SDNPHasChain]>; +def X86ProbedAlloca : SDNode<"X86ISD::PROBED_ALLOCA", SDT_X86PROBED_ALLOCA, + [SDNPHasChain]>; + def X86TLSCall : SDNode<"X86ISD::TLSCALL", SDT_X86TLSCALL, [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; diff --git a/llvm/test/CodeGen/X86/stack-clash-dynamic-alloca.ll b/llvm/test/CodeGen/X86/stack-clash-dynamic-alloca.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/stack-clash-dynamic-alloca.ll @@ -0,0 +1,44 @@ +; RUN: llc < %s | FileCheck %s + + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define i32 @foo(i32 %n) local_unnamed_addr #0 { + +; CHECK-LABEL: foo: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset %rbp, -16 +; CHECK-NEXT: movq %rsp, %rbp +; CHECK-NEXT: .cfi_def_cfa_register %rbp +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: leaq 15(,%rax,4), %rax +; CHECK-NEXT: andq $-16, %rax +; CHECK-NEXT: cmpq $4096, %rax # imm = 0x1000 +; CHECK-NEXT: jl .LBB0_3 +; CHECK-NEXT: .LBB0_2: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: subq $4096, %rax # imm = 0x1000 +; CHECK-NEXT: subq $4096, %rsp # imm = 0x1000 +; CHECK-NEXT: movq $0, (%rsp) +; CHECK-NEXT: cmpq $4096, %rax # imm = 0x1000 +; CHECK-NEXT: jge .LBB0_2 +; CHECK-NEXT: .LBB0_3: +; CHECK-NEXT: subq %rax, %rsp +; CHECK-NEXT: movq %rsp, %rax +; CHECK-NEXT: movl $1, 4792(%rax) +; CHECK-NEXT: movl (%rax), %eax +; CHECK-NEXT: movq %rbp, %rsp +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: .cfi_def_cfa %rsp, 8 +; CHECK-NEXT: retq + + %a = alloca i32, i32 %n, align 16 + %b = getelementptr inbounds i32, i32* %a, i64 1198 + store volatile i32 1, i32* %b + %c = load volatile i32, i32* %a + ret i32 %c +} + +attributes #0 = {"probe-stack"="inline-asm"} diff --git a/llvm/test/CodeGen/X86/stack-clash-large.ll b/llvm/test/CodeGen/X86/stack-clash-large.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/stack-clash-large.ll @@ -0,0 +1,38 @@ +; RUN: llc < %s | FileCheck %s + + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define i32 @foo() local_unnamed_addr #0 { + +; CHECK-LABEL: foo: +; CHECK: # %bb.0: +; CHECK-NEXT: movq %rsp, %r11 +; CHECK-NEXT: subq $69632, %r11 # imm = 0x11000 +; CHECK-NEXT: .LBB0_1: +; CHECK-NEXT: subq $4096, %rsp # imm = 0x1000 +; CHECK-NEXT: movq $0, (%rsp) +; CHECK-NEXT: 
cmpq %r11, %rsp +; CHECK-NEXT: jne .LBB0_1 +; CHECK-NEXT:# %bb.2: +; CHECK-NEXT: subq $2248, %rsp # imm = 0x8C8 +; CHECK-NEXT: .cfi_def_cfa_offset 71888 +; CHECK-NEXT: movl $1, 264(%rsp) +; CHECK-NEXT: movl $1, 28664(%rsp) +; CHECK-NEXT: movl -128(%rsp), %eax +; CHECK-NEXT: addq $71880, %rsp # imm = 0x118C8 +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq + + + %a = alloca i32, i64 18000, align 16 + %b0 = getelementptr inbounds i32, i32* %a, i64 98 + %b1 = getelementptr inbounds i32, i32* %a, i64 7198 + store volatile i32 1, i32* %b0 + store volatile i32 1, i32* %b1 + %c = load volatile i32, i32* %a + ret i32 %c +} + +attributes #0 = {"probe-stack"="inline-asm"} diff --git a/llvm/test/CodeGen/X86/stack-clash-medium-natural-probes-mutliple-objects.ll b/llvm/test/CodeGen/X86/stack-clash-medium-natural-probes-mutliple-objects.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/stack-clash-medium-natural-probes-mutliple-objects.ll @@ -0,0 +1,32 @@ +; RUN: llc < %s | FileCheck %s + + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define i32 @foo() local_unnamed_addr #0 { + +; CHECK-LABEL: foo: +; CHECK: # %bb.0: +; CHECK-NEXT subq $4096, %rsp # imm = 0x1000 +; CHECK-NEXT .cfi_def_cfa_offset 5888 +; CHECK-NEXT movl $1, 2088(%rsp) +; CHECK-NEXT subq $1784, %rsp # imm = 0x6F8 +; CHECK-NEXT movl $2, 672(%rsp) +; CHECK-NEXT movl 1872(%rsp), %eax +; CHECK-NEXT addq $5880, %rsp # imm = 0x16F8 +; CHECK-NEXT .cfi_def_cfa_offset 8 +; CHECK-NEXT retq + + + %a = alloca i32, i64 1000, align 16 + %b = alloca i32, i64 500, align 16 + %a0 = getelementptr inbounds i32, i32* %a, i64 500 + %b0 = getelementptr inbounds i32, i32* %b, i64 200 + store volatile i32 1, i32* %a0 + store volatile i32 2, i32* %b0 + %c = load volatile i32, i32* %a + ret i32 %c +} + +attributes #0 = {"probe-stack"="inline-asm"} diff --git a/llvm/test/CodeGen/X86/stack-clash-medium-natural-probes.ll b/llvm/test/CodeGen/X86/stack-clash-medium-natural-probes.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/stack-clash-medium-natural-probes.ll @@ -0,0 +1,32 @@ +; RUN: llc < %s | FileCheck %s + + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define i32 @foo() local_unnamed_addr #0 { + +; CHECK-LABEL: foo: +; CHECK: # %bb.0: +; CHECK-NEXT: subq $4096, %rsp # imm = 0x1000 +; CHECK-NEXT: movq $0, (%rsp) +; CHECK-NEXT: subq $3784, %rsp # imm = 0xEC8 +; CHECK-NEXT: .cfi_def_cfa_offset 7888 +; CHECK-NEXT: movl $1, 264(%rsp) +; CHECK-NEXT: movl $1, 4664(%rsp) +; CHECK-NEXT: movl -128(%rsp), %eax +; CHECK-NEXT: addq $7880, %rsp # imm = 0x1EC8 +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq + + + %a = alloca i32, i64 2000, align 16 + %b0 = getelementptr inbounds i32, i32* %a, i64 98 + %b1 = getelementptr inbounds i32, i32* %a, i64 1198 + store i32 1, i32* %b0 + store i32 1, i32* %b1 + %c = load volatile i32, i32* %a + ret i32 %c +} + +attributes #0 = {"probe-stack"="inline-asm"} diff --git a/llvm/test/CodeGen/X86/stack-clash-medium.ll b/llvm/test/CodeGen/X86/stack-clash-medium.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/stack-clash-medium.ll @@ -0,0 +1,27 @@ +; RUN: llc < %s | FileCheck %s + + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define i32 @foo() local_unnamed_addr #0 { + +; CHECK-LABEL: foo: +; CHECK: # %bb.0: +; CHECK-NEXT: subq $4096, %rsp # imm = 0x1000 +; CHECK-NEXT: .cfi_def_cfa_offset 
7888 +; CHECK-NEXT: movl $1, 3288(%rsp) +; CHECK-NEXT: subq $3784, %rsp # imm = 0xEC8 +; CHECK-NEXT: movq $0, (%rsp) +; CHECK-NEXT: movl -128(%rsp), %eax +; CHECK-NEXT: addq $7880, %rsp # imm = 0x1EC8 + + + %a = alloca i32, i64 2000, align 16 + %b = getelementptr inbounds i32, i32* %a, i64 1800 + store volatile i32 1, i32* %b + %c = load volatile i32, i32* %a + ret i32 %c +} + +attributes #0 = {"probe-stack"="inline-asm"} diff --git a/llvm/test/CodeGen/X86/stack-clash-no-free-probe.ll b/llvm/test/CodeGen/X86/stack-clash-no-free-probe.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/stack-clash-no-free-probe.ll @@ -0,0 +1,27 @@ +; RUN: llc < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define i32 @foo(i64 %i) local_unnamed_addr #0 { +; CHECK-LABEL: foo: +; CHECK: # %bb.0: +; CHECK-NEXT: subq $4096, %rsp # imm = 0x1000 +; CHECK-NEXT: movq $0, (%rsp) +; CHECK-NEXT: subq $3784, %rsp # imm = 0xEC8 +; CHECK-NEXT: .cfi_def_cfa_offset 7888 +; CHECK-NEXT: movl $1, -128(%rsp,%rdi,4) +; CHECK-NEXT: movl -128(%rsp), %eax +; CHECK-NEXT: addq $7880, %rsp # imm = 0x1EC8 +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq + + %a = alloca i32, i32 2000, align 16 + %b = getelementptr inbounds i32, i32* %a, i64 %i + store volatile i32 1, i32* %b + %c = load volatile i32, i32* %a + ret i32 %c +} + +attributes #0 = {"probe-stack"="inline-asm"} + diff --git a/llvm/test/CodeGen/X86/stack-clash-small.ll b/llvm/test/CodeGen/X86/stack-clash-small.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/stack-clash-small.ll @@ -0,0 +1,25 @@ +; RUN: llc < %s | FileCheck %s + + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define i32 @foo() local_unnamed_addr #0 { +; CHECK-LABEL: foo: +; CHECK: # %bb.0: +; CHECK-NEXT: subq $280, %rsp # imm = 0x118 +; CHECK-NEXT: .cfi_def_cfa_offset 288 +; CHECK-NEXT: movl $1, 264(%rsp) +; CHECK-NEXT: movl -128(%rsp), %eax +; CHECK-NEXT: addq $280, %rsp # imm = 0x118 +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq + + %a = alloca i32, i64 100, align 16 + %b = getelementptr inbounds i32, i32* %a, i64 98 + store volatile i32 1, i32* %b + %c = load volatile i32, i32* %a + ret i32 %c +} + +attributes #0 = {"probe-stack"="inline-asm"} diff --git a/llvm/test/CodeGen/X86/stack-clash-unknown-call.ll b/llvm/test/CodeGen/X86/stack-clash-unknown-call.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/stack-clash-unknown-call.ll @@ -0,0 +1,31 @@ +; RUN: llc < %s | FileCheck %s + + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1 immarg); + +define void @foo() local_unnamed_addr #0 { + +;CHECK-LABEL: foo: +;CHECK: # %bb.0: +;CHECK-NEXT: subq $4096, %rsp # imm = 0x1000 +; it's important that we don't use the call as a probe here +;CHECK-NEXT: movq $0, (%rsp) +;CHECK-NEXT: subq $3912, %rsp # imm = 0xF48 +;CHECK-NEXT: .cfi_def_cfa_offset 8016 +;CHECK-NEXT: movq %rsp, %rdi +;CHECK-NEXT: movl $8000, %edx # imm = 0x1F40 +;CHECK-NEXT: xorl %esi, %esi +;CHECK-NEXT: callq memset +;CHECK-NEXT: addq $8008, %rsp # imm = 0x1F48 +;CHECK-NEXT: .cfi_def_cfa_offset 8 +;CHECK-NEXT: retq + + %a = alloca i8, i64 8000, align 16 + call void @llvm.memset.p0i8.i64(i8* align 16 %a, i8 0, i64 8000, i1 false) + ret void +} + +attributes #0 = {"probe-stack"="inline-asm"}
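The LLVM IR tests above cover fixed-size frames and alloca-based frames at the assembly level. As a closing illustration, here is a small C sketch of the dynamic-allocation case that the new X86ISD::PROBED_ALLOCA lowering handles, assuming an x86-64 Linux target and the default 4096-byte probe interval; the file name, function names and sizes are invented for the example and the exact code generated depends on the target and optimization level.

/* vla-probe-demo.c: with -fstack-clash-protection, the variable-sized
   allocation below is expected to be lowered through PROBED_ALLOCA: the
   stack pointer is moved by at most one probe interval (4096 bytes) at a
   time and each newly allocated page is stored to before the next step,
   with the remainder handled by a final, un-probed subtraction. */
#include <string.h>

int use_vla(unsigned n) {
  char buf[n]; /* variable-sized allocation -> probed allocation loop */
  memset(buf, 0, n);
  return buf[n - 1];
}

int main(int argc, char **argv) {
  (void)argv;
  /* Roughly twenty pages when argc == 1; small requests exercise only the
     tail path that subtracts the remainder without looping. */
  return use_vla((unsigned)argc * 80000u);
}

Under this scheme an attacker-controlled allocation size can no longer move the stack pointer past the guard page in a single step, which is the property the stack-clash-dynamic-alloca.ll test checks on the generated assembly.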