diff --git a/clang/docs/ClangCommandLineReference.rst b/clang/docs/ClangCommandLineReference.rst --- a/clang/docs/ClangCommandLineReference.rst +++ b/clang/docs/ClangCommandLineReference.rst @@ -1911,6 +1911,10 @@ Emit section containing metadata on function stack sizes +.. option:: -fstack-clash-protection, -fno-stack-clash-protection + +Instrument stack allocation to prevent stack clash attacks (x86, Linux only). + .. option:: -fstandalone-debug, -fno-limit-debug-info, -fno-standalone-debug Emit full debug info for all types used by the program diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -61,6 +61,10 @@ ------------------ +- ``-fstack-clash-protection`` protects against stack clash attacks on x86 + Linux targets by automatically probing each page of the allocated + stack. + Deprecated Compiler Flags ------------------------- diff --git a/clang/include/clang/Basic/CodeGenOptions.def b/clang/include/clang/Basic/CodeGenOptions.def --- a/clang/include/clang/Basic/CodeGenOptions.def +++ b/clang/include/clang/Basic/CodeGenOptions.def @@ -149,6 +149,7 @@ CODEGENOPT(EnableSegmentedStacks , 1, 0) ///< Set when -fsplit-stack is enabled. CODEGENOPT(NoInlineLineTables, 1, 0) ///< Whether debug info should contain ///< inline line tables. +CODEGENOPT(StackClashProtector, 1, 0) ///< Set when -fstack-clash-protection is enabled. CODEGENOPT(NoImplicitFloat , 1, 0) ///< Set when -mno-implicit-float is enabled. CODEGENOPT(NoInfsFPMath , 1, 0) ///< Assume FP arguments, results not +-Inf. 
CODEGENOPT(NoSignedZeros , 1, 0) ///< Allow ignoring the signedness of FP zero diff --git a/clang/include/clang/Basic/DiagnosticCommonKinds.td b/clang/include/clang/Basic/DiagnosticCommonKinds.td --- a/clang/include/clang/Basic/DiagnosticCommonKinds.td +++ b/clang/include/clang/Basic/DiagnosticCommonKinds.td @@ -239,6 +239,9 @@ let CategoryName = "Inline Assembly Issue" in { def err_asm_invalid_type_in_input : Error< "invalid type %0 in asm input for constraint '%1'">; + + def warn_stack_clash_protection_inline_asm : Warning< + "Unable to protect inline asm that clobbers stack pointer against stack clash">; } // Sema && Serialization diff --git a/clang/include/clang/Basic/TargetInfo.h b/clang/include/clang/Basic/TargetInfo.h --- a/clang/include/clang/Basic/TargetInfo.h +++ b/clang/include/clang/Basic/TargetInfo.h @@ -816,6 +816,8 @@ StringRef getNormalizedGCCRegisterName(StringRef Name, bool ReturnCanonical = false) const; + virtual bool isSPRegName(StringRef) const { return false; } + /// Extracts a register from the passed constraint (if it is a /// single-register constraint) and the asm label expression related to a /// variable in the input or output list of an inline asm statement. 
diff --git a/clang/include/clang/Driver/CC1Options.td b/clang/include/clang/Driver/CC1Options.td --- a/clang/include/clang/Driver/CC1Options.td +++ b/clang/include/clang/Driver/CC1Options.td @@ -741,6 +741,9 @@ HelpText<"Enable stack protectors">; def stack_protector_buffer_size : Separate<["-"], "stack-protector-buffer-size">, HelpText<"Lower bound for a buffer to be considered for stack protection">; +// Boolean switch that takes no value: it must be a Flag, a Separate would consume the next cc1 argument. +def stack_clash_protection : Flag<["-"], "stack-clash-protection">, + HelpText<"Enable stack clash protection">; def fvisibility : Separate<["-"], "fvisibility">, HelpText<"Default type and symbol visibility">; def ftype_visibility : Separate<["-"], "ftype-visibility">, diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -1770,6 +1770,13 @@ def fsplit_stack : Flag<["-"], "fsplit-stack">, Group<f_Group>; def fstack_protector_all : Flag<["-"], "fstack-protector-all">, Group<f_Group>, HelpText<"Enable stack protectors for all functions">; +def fstack_clash_protection : Flag<["-"], "fstack-clash-protection">, Group<f_Group>, + HelpText<"Enable stack clash protection">; +def fnostack_clash_protection : Flag<["-"], "fnostack-clash-protection">, Group<f_Group>, + HelpText<"Disable stack clash protection">; +// -fno-stack-clash-protection is the documented spelling; accept it as an alias of -fnostack-clash-protection. +def fno_stack_clash_protection : Flag<["-"], "fno-stack-clash-protection">, Group<f_Group>, + Alias<fnostack_clash_protection>; def fstack_protector_strong : Flag<["-"], "fstack-protector-strong">, Group<f_Group>, HelpText<"Enable stack protectors for some functions vulnerable to stack smashing. 
" "Compared to -fstack-protector, this uses a stronger heuristic " diff --git a/clang/lib/Basic/Targets/X86.h b/clang/lib/Basic/Targets/X86.h --- a/clang/lib/Basic/Targets/X86.h +++ b/clang/lib/Basic/Targets/X86.h @@ -166,6 +166,10 @@ ArrayRef getGCCAddlRegNames() const override; + bool isSPRegName(StringRef RegName) const override { + return RegName.equals("esp") || RegName.equals("rsp"); + } + bool validateCpuSupports(StringRef Name) const override; bool validateCpuIs(StringRef Name) const override; diff --git a/clang/lib/CodeGen/CGStmt.cpp b/clang/lib/CodeGen/CGStmt.cpp --- a/clang/lib/CodeGen/CGStmt.cpp +++ b/clang/lib/CodeGen/CGStmt.cpp @@ -2247,8 +2247,14 @@ if (Clobber == "memory") ReadOnly = ReadNone = false; - else if (Clobber != "cc") + else if (Clobber != "cc") { Clobber = getTarget().getNormalizedGCCRegisterName(Clobber); + if (CGM.getCodeGenOpts().StackClashProtector && + getTarget().isSPRegName(Clobber)) { + CGM.getDiags().Report(S.getAsmLoc(), + diag::warn_stack_clash_protection_inline_asm); + } + } if (!Constraints.empty()) Constraints += ','; diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -1494,6 +1494,9 @@ if (CodeGenOpts.UnwindTables) B.addAttribute(llvm::Attribute::UWTable); + if (CodeGenOpts.StackClashProtector) + B.addAttribute("probe-stack", "inline-asm"); + if (!hasUnwindExceptions(LangOpts)) B.addAttribute(llvm::Attribute::NoUnwind); diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -2989,6 +2989,21 @@ } } +static void RenderSCPOptions(const ToolChain &TC, const ArgList &Args, + ArgStringList &CmdArgs) { + const llvm::Triple &EffectiveTriple = TC.getEffectiveTriple(); + + if (!EffectiveTriple.isOSLinux()) + return; + + if (!EffectiveTriple.isX86()) + return; + + if 
(Args.hasFlag(options::OPT_fstack_clash_protection, + options::OPT_fnostack_clash_protection, false)) + CmdArgs.push_back("-stack-clash-protection"); +} + static void RenderTrivialAutoVarInitOptions(const Driver &D, const ToolChain &TC, const ArgList &Args, @@ -5203,6 +5218,7 @@ CmdArgs.push_back(Args.MakeArgString("-mspeculative-load-hardening")); RenderSSPOptions(TC, Args, CmdArgs, KernelOrKext); + RenderSCPOptions(TC, Args, CmdArgs); RenderTrivialAutoVarInitOptions(D, TC, Args, CmdArgs); // Translate -mstackrealign diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp --- a/clang/lib/Frontend/CompilerInvocation.cpp +++ b/clang/lib/Frontend/CompilerInvocation.cpp @@ -1224,6 +1224,8 @@ Opts.NoStackArgProbe = Args.hasArg(OPT_mno_stack_arg_probe); + Opts.StackClashProtector = Args.hasArg(OPT_stack_clash_protection); + if (Arg *A = Args.getLastArg(OPT_fobjc_dispatch_method_EQ)) { StringRef Name = A->getValue(); unsigned Method = llvm::StringSwitch(Name) diff --git a/clang/test/CodeGen/stack-clash-protection.c b/clang/test/CodeGen/stack-clash-protection.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/stack-clash-protection.c @@ -0,0 +1,34 @@ +// check interaction between -fstack-clash-protection and dynamic allocation schemes +// RUN: %clang -target x86_64 -O0 -o %t.out %s -fstack-clash-protection && %t.out + +#include +#include + +int large_stack() __attribute__((noinline)); + +int large_stack() { + int stack[20000], i; + for (i = 0; i < sizeof(stack) / sizeof(int); ++i) + stack[i] = i; + return stack[1]; +} + +int main(int argc, char **argv) { + int volatile static_mem[8000]; + for (size_t i = 0; i < argc * sizeof(static_mem) / sizeof(static_mem[0]); ++i) + static_mem[i] = argc * i; + + int vla[argc]; + memset(&vla[0], 0, argc); + + int index = large_stack(); + + // also check allocation of 0 size + volatile void *mem = __builtin_alloca(argc - 1); + + int volatile *dyn_mem = alloca(sizeof(static_mem) * 
argc); + for (size_t i = 0; i < argc * sizeof(static_mem) / sizeof(static_mem[0]); ++i) + dyn_mem[i] = argc * i; + + return static_mem[(7999 * argc) / 2] - dyn_mem[(7999 * argc) / 2] + vla[argc - index]; +} diff --git a/clang/test/Driver/stack-clash-protection.c b/clang/test/Driver/stack-clash-protection.c new file mode 100644 --- /dev/null +++ b/clang/test/Driver/stack-clash-protection.c @@ -0,0 +1,33 @@ +// RUN: %clang -target i386-unknown-linux -fstack-clash-protection -### %s 2>&1 | FileCheck %s -check-prefix=SCP-i386 +// RUN: %clang -target i386-unknown-linux -fnostack-clash-protection -fstack-clash-protection -### %s 2>&1 | FileCheck %s -check-prefix=SCP-i386 +// RUN: %clang -target i386-unknown-linux -fstack-clash-protection -fnostack-clash-protection -### %s 2>&1 | FileCheck %s -check-prefix=SCP-i386-NO +// SCP-i386: "-stack-clash-protection" +// SCP-i386-NO-NOT: "-stack-clash-protection" + +// RUN: %clang -target x86_64-scei-linux -fstack-clash-protection -### %s 2>&1 | FileCheck %s -check-prefix=SCP-x86 +// SCP-x86: "-stack-clash-protection" + +// RUN: %clang -target armv7k-apple-linux -fstack-clash-protection -### %s 2>&1 | FileCheck %s -check-prefix=SCP-armv7 +// SCP-armv7-NOT: "-stack-clash-protection" +// SCP-armv7: argument unused during compilation: '-fstack-clash-protection' + +// RUN: %clang -target x86_64-unknown-linux -fstack-clash-protection -c %s 2>&1 | FileCheck %s -check-prefix=SCP-warn +// SCP-warn: warning: Unable to protect inline asm that clobbers stack pointer against stack clash + +// RUN: %clang -target x86_64-pc-unknown-linux -fstack-clash-protection -S -emit-llvm -o- %s | FileCheck %s -check-prefix=SCP-ll-linux64 +// SCP-ll-linux64: attributes {{.*}} "probe-stack"="inline-asm" + +// RUN: %clang -target x86_64-pc-windows-msvc -fstack-clash-protection -S -emit-llvm -o- %s 2>&1 | FileCheck %s -check-prefix=SCP-ll-win64 +// SCP-ll-win64-NOT: attributes {{.*}} "probe-stack"="inline-asm" +// SCP-ll-win64: argument unused during 
compilation: '-fstack-clash-protection' + +int foo(int c) { + int r; + __asm__("sub %0, %%rsp" + : + : "rm"(c) + : "rsp"); + __asm__("mov %%rsp, %0" + : "=rm"(r)::); + return r; +} diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst --- a/llvm/docs/ReleaseNotes.rst +++ b/llvm/docs/ReleaseNotes.rst @@ -84,6 +84,10 @@ During this release ... +* Functions with the probe-stack attribute set to "inline-asm" are now protected + against stack clash without the need of a third-party probing function and + with limited impact on performance. + Changes to the AMDGPU Target ----------------------------- diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -1653,6 +1653,12 @@ - /// Returns the name of the symbol used to emit stack probes or the empty - /// string if not applicable. + /// Returns true if stack probing through a function call is requested. + virtual bool hasStackProbeSymbol(MachineFunction &MF) const { return false; } + + /// Returns true if stack probing through inline assembly is requested. + virtual bool hasInlineStackProbe(MachineFunction &MF) const { return false; } + + /// Returns the name of the symbol used to emit stack probes or the empty + /// string if not applicable. virtual StringRef getStackProbeSymbolName(MachineFunction &MF) const { return ""; } diff --git a/llvm/lib/Target/X86/X86CallFrameOptimization.cpp b/llvm/lib/Target/X86/X86CallFrameOptimization.cpp --- a/llvm/lib/Target/X86/X86CallFrameOptimization.cpp +++ b/llvm/lib/Target/X86/X86CallFrameOptimization.cpp @@ -162,14 +162,13 @@ // memory for arguments. 
unsigned FrameSetupOpcode = TII->getCallFrameSetupOpcode(); unsigned FrameDestroyOpcode = TII->getCallFrameDestroyOpcode(); - bool UseStackProbe = - !STI->getTargetLowering()->getStackProbeSymbolName(MF).empty(); + bool EmitStackProbeCall = STI->getTargetLowering()->hasStackProbeSymbol(MF); unsigned StackProbeSize = STI->getTargetLowering()->getStackProbeSize(MF); for (MachineBasicBlock &BB : MF) { bool InsideFrameSequence = false; for (MachineInstr &MI : BB) { if (MI.getOpcode() == FrameSetupOpcode) { - if (TII->getFrameSize(MI) >= StackProbeSize && UseStackProbe) + if (TII->getFrameSize(MI) >= StackProbeSize && EmitStackProbeCall) return false; if (InsideFrameSequence) return false; diff --git a/llvm/lib/Target/X86/X86FrameLowering.h b/llvm/lib/Target/X86/X86FrameLowering.h --- a/llvm/lib/Target/X86/X86FrameLowering.h +++ b/llvm/lib/Target/X86/X86FrameLowering.h @@ -189,11 +189,33 @@ void emitStackProbeInline(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, bool InProlog) const; + void emitStackProbeInlineWindowsCoreCLR64(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + const DebugLoc &DL, + bool InProlog) const; + void emitStackProbeInlineGeneric(MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + const DebugLoc &DL, bool InProlog) const; + + void emitStackProbeInlineGenericBlock(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + const DebugLoc &DL, + uint64_t Offset) const; + + void emitStackProbeInlineGenericLoop(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + const DebugLoc &DL, + uint64_t Offset) const; /// Emit a stub to later inline the target stack probe. 
- void emitStackProbeInlineStub(MachineFunction &MF, MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - const DebugLoc &DL, bool InProlog) const; + MachineInstr *emitStackProbeInlineStub(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + const DebugLoc &DL, + bool InProlog) const; /// Aligns the stack pointer by ANDing it with -MaxAlign. void BuildStackAlignAND(MachineBasicBlock &MBB, diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp --- a/llvm/lib/Target/X86/X86FrameLowering.cpp +++ b/llvm/lib/Target/X86/X86FrameLowering.cpp @@ -17,6 +17,7 @@ #include "X86Subtarget.h" #include "X86TargetMachine.h" #include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/Statistic.h" #include "llvm/Analysis/EHPersonalities.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" @@ -32,6 +33,12 @@ #include "llvm/Target/TargetOptions.h" #include +#define DEBUG_TYPE "x86-fl" + +STATISTIC(NumFrameLoopProbe, "Number of loop stack probes used in prologue"); +STATISTIC(NumFrameExtraProbe, + "Number of extra stack probes generated in prologue"); + using namespace llvm; X86FrameLowering::X86FrameLowering(const X86Subtarget &STI, @@ -257,7 +264,27 @@ uint64_t Chunk = (1LL << 31) - 1; - if (Offset > Chunk) { + MachineFunction &MF = *MBB.getParent(); + const X86Subtarget &STI = MF.getSubtarget(); + const X86TargetLowering &TLI = *STI.getTargetLowering(); + const bool EmitInlineStackProbe = TLI.hasInlineStackProbe(MF); + + // It's ok to not take large chunks into account when probing, as the + // allocation is split into smaller chunks anyway. + if (EmitInlineStackProbe && !InEpilogue) { + + // Stack probing may involve looping, and control-flow generation is + // disallowed at this point. Rely on later processing through + // `inlineStackProbe`. 
+ MachineInstr *Stub = emitStackProbeInlineStub(MF, MBB, MBBI, DL, true); + + // Encode the static offset as a metadata attached to the stub. + LLVMContext &Context = MF.getFunction().getContext(); + MachineInstrBuilder(MF, Stub).addMetadata( + MDTuple::get(Context, {ConstantAsMetadata::get(ConstantInt::get( + IntegerType::get(Context, 64), Offset))})); + return; + } else if (Offset > Chunk) { // Rather than emit a long series of instructions for large offsets, // load the offset into a register and do one sub/add unsigned Reg = 0; @@ -381,8 +408,8 @@ } else { bool IsSub = Offset < 0; uint64_t AbsOffset = IsSub ? -Offset : Offset; - unsigned Opc = IsSub ? getSUBriOpcode(Uses64BitFramePtr, AbsOffset) - : getADDriOpcode(Uses64BitFramePtr, AbsOffset); + const unsigned Opc = IsSub ? getSUBriOpcode(Uses64BitFramePtr, AbsOffset) + : getADDriOpcode(Uses64BitFramePtr, AbsOffset); MI = BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr) .addReg(StackPtr) .addImm(AbsOffset); @@ -528,6 +555,169 @@ const DebugLoc &DL, bool InProlog) const { const X86Subtarget &STI = MF.getSubtarget(); + if (STI.isTargetWindowsCoreCLR() && STI.is64Bit()) + emitStackProbeInlineWindowsCoreCLR64(MF, MBB, MBBI, DL, InProlog); + else + emitStackProbeInlineGeneric(MF, MBB, MBBI, DL, InProlog); +} + +void X86FrameLowering::emitStackProbeInlineGeneric( + MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, const DebugLoc &DL, bool InProlog) const { + MachineInstr &CallToInline = *std::prev(MBBI); + assert(CallToInline.getOperand(1).isMetadata() && + "no metadata attached to that probe"); + uint64_t Offset = + cast( + cast( + cast(CallToInline.getOperand(1).getMetadata()) + ->getOperand(0)) + ->getValue()) + ->getZExtValue(); + + const X86Subtarget &STI = MF.getSubtarget(); + const X86TargetLowering &TLI = *STI.getTargetLowering(); + assert(!(STI.is64Bit() && STI.isTargetWindowsCoreCLR()) && + "different expansion expected for CoreCLR 64 bit"); + + const uint64_t StackProbeSize = 
TLI.getStackProbeSize(MF); + uint64_t ProbeChunk = StackProbeSize * 8; + + // Synthesize a loop or unroll it, depending on the number of iterations. + if (Offset > ProbeChunk) { + emitStackProbeInlineGenericLoop(MF, MBB, MBBI, DL, Offset); + } else { + emitStackProbeInlineGenericBlock(MF, MBB, MBBI, DL, Offset); + } +} + +void X86FrameLowering::emitStackProbeInlineGenericBlock( + MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, const DebugLoc &DL, + uint64_t Offset) const { + + const X86Subtarget &STI = MF.getSubtarget(); + const X86TargetLowering &TLI = *STI.getTargetLowering(); + const unsigned Opc = getSUBriOpcode(Uses64BitFramePtr, Offset); + const unsigned MovMIOpc = Is64Bit ? X86::MOV64mi32 : X86::MOV32mi; + const uint64_t StackProbeSize = TLI.getStackProbeSize(MF); + uint64_t CurrentOffset = 0; + // 0 Thanks to return address being saved on the stack + uint64_t CurrentProbeOffset = 0; + + // For the first N - 1 pages, just probe. I tried to take advantage of + // natural probes but it implies much more logic and there was very few + // interesting natural probes to interleave. + while (CurrentOffset + StackProbeSize < Offset) { + MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr) + .addReg(StackPtr) + .addImm(StackProbeSize) + .setMIFlag(MachineInstr::FrameSetup); + MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead. + + + addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(MovMIOpc)) + .setMIFlag(MachineInstr::FrameSetup), + StackPtr, false, 0) + .addImm(0) + .setMIFlag(MachineInstr::FrameSetup); + NumFrameExtraProbe++; + CurrentOffset += StackProbeSize; + CurrentProbeOffset += StackProbeSize; + } + + uint64_t ChunkSize = Offset - CurrentOffset; + MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr) + .addReg(StackPtr) + .addImm(ChunkSize) + .setMIFlag(MachineInstr::FrameSetup); + MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead. 
+} + +void X86FrameLowering::emitStackProbeInlineGenericLoop( + MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, const DebugLoc &DL, + uint64_t Offset) const { + // Allocate Offset bytes of stack one StackProbeSize page at a time in a loop that touches each page; the remainder (Offset % StackProbeSize) is allocated in the tail block. + const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>(); + const X86TargetLowering &TLI = *STI.getTargetLowering(); + const unsigned MovMIOpc = Is64Bit ? X86::MOV64mi32 : X86::MOV32mi; + const uint64_t StackProbeSize = TLI.getStackProbeSize(MF); + + // Synthesize a loop + NumFrameLoopProbe++; + const BasicBlock *LLVM_BB = MBB.getBasicBlock(); + + MachineBasicBlock *testMBB = MF.CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *tailMBB = MF.CreateMachineBasicBlock(LLVM_BB); + + MachineFunction::iterator MBBIter = ++MBB.getIterator(); + MF.insert(MBBIter, testMBB); + MF.insert(MBBIter, tailMBB); + // FIXME: R11/R11D are only encodable in 64-bit mode; 32-bit targets need a different scratch register. + unsigned FinalStackPtr = Uses64BitFramePtr ? X86::R11 : X86::R11D; + BuildMI(MBB, MBBI, DL, TII.get(Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr), FinalStackPtr) + .addReg(StackPtr) + .setMIFlag(MachineInstr::FrameSetup); + + // save loop bound: the stack pointer value once all whole pages are allocated + { + const unsigned Opc = getSUBriOpcode(Uses64BitFramePtr, Offset); + BuildMI(MBB, MBBI, DL, TII.get(Opc), FinalStackPtr) + .addReg(FinalStackPtr) + .addImm(Offset / StackProbeSize * StackProbeSize) + .setMIFlag(MachineInstr::FrameSetup); + } + + // allocate a page + { + const unsigned Opc = getSUBriOpcode(Uses64BitFramePtr, StackProbeSize); + BuildMI(testMBB, DL, TII.get(Opc), StackPtr) + .addReg(StackPtr) + .addImm(StackProbeSize) + .setMIFlag(MachineInstr::FrameSetup); + } + + // touch the page + addRegOffset(BuildMI(testMBB, DL, TII.get(MovMIOpc)) + .setMIFlag(MachineInstr::FrameSetup), + StackPtr, false, 0) + .addImm(0) + .setMIFlag(MachineInstr::FrameSetup); + + // cmp with stack pointer bound (same width as the stack pointer registers above) + BuildMI(testMBB, DL, TII.get(Uses64BitFramePtr ? 
X86::CMP64rr : X86::CMP32rr)) + .addReg(StackPtr) + .addReg(FinalStackPtr) + .setMIFlag(MachineInstr::FrameSetup); + + // jump back while the bound has not been reached + BuildMI(testMBB, DL, TII.get(X86::JCC_1)) + .addMBB(testMBB) + .addImm(X86::COND_NE) + .setMIFlag(MachineInstr::FrameSetup); + testMBB->addSuccessor(testMBB); + testMBB->addSuccessor(tailMBB); + + // move the rest of MBB into tailMBB and rewire the CFG + + tailMBB->splice(tailMBB->end(), &MBB, MBBI, MBB.end()); + tailMBB->transferSuccessorsAndUpdatePHIs(&MBB); + MBB.addSuccessor(testMBB); + + if (Offset % StackProbeSize) { + const unsigned Opc = getSUBriOpcode(Uses64BitFramePtr, Offset); + BuildMI(*tailMBB, tailMBB->begin(), DL, TII.get(Opc), StackPtr) + .addReg(StackPtr) + .addImm(Offset % StackProbeSize) + .setMIFlag(MachineInstr::FrameSetup); + } +} + +void X86FrameLowering::emitStackProbeInlineWindowsCoreCLR64( + MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, const DebugLoc &DL, bool InProlog) const { + const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>(); assert(STI.is64Bit() && "different expansion needed for 32 bit"); assert(STI.isTargetWindowsCoreCLR() && "custom expansion expects CoreCLR"); const TargetInstrInfo &TII = *STI.getInstrInfo(); @@ -821,13 +1011,13 @@ } } -void X86FrameLowering::emitStackProbeInlineStub( +MachineInstr *X86FrameLowering::emitStackProbeInlineStub( MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, bool InProlog) const { assert(InProlog && "ChkStkStub called outside prolog!"); - BuildMI(MBB, MBBI, DL, TII.get(X86::CALLpcrel32)) + return BuildMI(MBB, MBBI, DL, TII.get(X86::CALLpcrel32)) .addExternalSymbol("__chkstk_stub"); } @@ -1014,7 +1204,8 @@ X86FI->setCalleeSavedFrameSize( X86FI->getCalleeSavedFrameSize() - TailCallReturnAddrDelta); - bool UseStackProbe = !STI.getTargetLowering()->getStackProbeSymbolName(MF).empty(); + const bool EmitStackProbeCall = + STI.getTargetLowering()->hasStackProbeSymbol(MF); unsigned StackProbeSize = 
STI.getTargetLowering()->getStackProbeSize(MF); // Re-align the stack on 64-bit if the x86-interrupt calling convention is @@ -1032,11 +1223,10 @@ // pointer, calls, or dynamic alloca then we do not need to adjust the // stack pointer (we fit in the Red Zone). We also check that we don't // push and pop from the stack. - if (has128ByteRedZone(MF) && - !TRI->needsStackRealignment(MF) && + if (has128ByteRedZone(MF) && !TRI->needsStackRealignment(MF) && !MFI.hasVarSizedObjects() && // No dynamic alloca. !MFI.adjustsStack() && // No calls. - !UseStackProbe && // No stack probes. + !EmitStackProbeCall && // No stack probes. !MFI.hasCopyImplyingStackAdjustment() && // Don't push and pop. !MF.shouldSplitStack()) { // Regular stack uint64_t MinSize = X86FI->getCalleeSavedFrameSize(); @@ -1237,7 +1427,7 @@ uint64_t AlignedNumBytes = NumBytes; if (IsWin64Prologue && !IsFunclet && TRI->needsStackRealignment(MF)) AlignedNumBytes = alignTo(AlignedNumBytes, MaxAlign); - if (AlignedNumBytes >= StackProbeSize && UseStackProbe) { + if (AlignedNumBytes >= StackProbeSize && EmitStackProbeCall) { assert(!X86FI->getUsesRedZone() && "The Red Zone is not accounted for in stack probes"); diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -537,6 +537,10 @@ // falls back to heap allocation if not. SEG_ALLOCA, + // For allocating stack space when using stack clash protector. + // Allocation is performed by block, and each block is probed. + PROBED_ALLOCA, + // Memory barriers. 
MEMBARRIER, MFENCE, @@ -1250,6 +1254,8 @@ bool supportSwiftError() const override; + bool hasStackProbeSymbol(MachineFunction &MF) const override; + bool hasInlineStackProbe(MachineFunction &MF) const override; StringRef getStackProbeSymbolName(MachineFunction &MF) const override; unsigned getStackProbeSize(MachineFunction &MF) const; @@ -1475,6 +1481,9 @@ MachineBasicBlock *EmitLoweredSegAlloca(MachineInstr &MI, MachineBasicBlock *BB) const; + MachineBasicBlock *EmitLoweredProbedAlloca(MachineInstr &MI, + MachineBasicBlock *BB) const; + MachineBasicBlock *EmitLoweredTLSAddr(MachineInstr &MI, MachineBasicBlock *BB) const; diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -23077,9 +23077,9 @@ SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); bool SplitStack = MF.shouldSplitStack(); - bool EmitStackProbe = !getStackProbeSymbolName(MF).empty(); + bool EmitStackProbeCall = hasStackProbeSymbol(MF); bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) || - SplitStack || EmitStackProbe; + SplitStack || EmitStackProbeCall; SDLoc dl(Op); // Get the inputs. 
@@ -23103,11 +23103,21 @@ assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and" " not tell us which reg is the stack pointer!"); - SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT); - Chain = SP.getValue(1); const TargetFrameLowering &TFI = *Subtarget.getFrameLowering(); const Align StackAlign(TFI.getStackAlignment()); - Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value + if (hasInlineStackProbe(MF)) { + MachineRegisterInfo &MRI = MF.getRegInfo(); + + const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy); + Register Vreg = MRI.createVirtualRegister(AddrRegClass); + Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size); + Result = DAG.getNode(X86ISD::PROBED_ALLOCA, dl, SPTy, Chain, + DAG.getRegister(Vreg, SPTy)); + } else { + SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT); + Chain = SP.getValue(1); + Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value + } if (Alignment && Alignment > StackAlign) Result = DAG.getNode(ISD::AND, dl, VT, Result, @@ -29837,6 +29847,8 @@ case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER"; case X86ISD::MFENCE: return "X86ISD::MFENCE"; case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA"; + case X86ISD::PROBED_ALLOCA: + return "X86ISD::PROBED_ALLOCA"; case X86ISD::SAHF: return "X86ISD::SAHF"; case X86ISD::RDRAND: return "X86ISD::RDRAND"; case X86ISD::RDSEED: return "X86ISD::RDSEED"; @@ -31102,6 +31114,97 @@ } MachineBasicBlock * +X86TargetLowering::EmitLoweredProbedAlloca(MachineInstr &MI, + MachineBasicBlock *BB) const { + MachineFunction *MF = BB->getParent(); + const TargetInstrInfo *TII = Subtarget.getInstrInfo(); + const X86FrameLowering &TFI = *Subtarget.getFrameLowering(); + DebugLoc DL = MI.getDebugLoc(); + const BasicBlock *LLVM_BB = BB->getBasicBlock(); + + const unsigned ProbeSize = getStackProbeSize(*MF); + + MachineRegisterInfo &MRI = MF->getRegInfo(); + MachineBasicBlock *testMBB = MF->CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *tailMBB = 
MF->CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *blockMBB = MF->CreateMachineBasicBlock(LLVM_BB); + + MachineFunction::iterator MBBIter = ++BB->getIterator(); + MF->insert(MBBIter, testMBB); + MF->insert(MBBIter, blockMBB); + MF->insert(MBBIter, tailMBB); + + unsigned sizeVReg = MI.getOperand(1).getReg(); + + const TargetRegisterClass *SizeRegClass = MRI.getRegClass(sizeVReg); + + unsigned tmpSizeVReg = MRI.createVirtualRegister(SizeRegClass); + unsigned tmpSizeVReg2 = MRI.createVirtualRegister(SizeRegClass); + + unsigned physSPReg = TFI.Uses64BitFramePtr ? X86::RSP : X86::ESP; + + // test rsp size + BuildMI(testMBB, DL, TII->get(X86::PHI), tmpSizeVReg) + .addReg(sizeVReg) + .addMBB(BB) + .addReg(tmpSizeVReg2) + .addMBB(blockMBB); + + BuildMI(testMBB, DL, + TII->get(TFI.Uses64BitFramePtr ? X86::CMP64ri32 : X86::CMP32ri)) + .addReg(tmpSizeVReg) + .addImm(ProbeSize); + + BuildMI(testMBB, DL, TII->get(X86::JCC_1)) + .addMBB(tailMBB) + .addImm(X86::COND_L); + testMBB->addSuccessor(blockMBB); + testMBB->addSuccessor(tailMBB); + + // allocate a block and touch it + + BuildMI(blockMBB, DL, + TII->get(TFI.Uses64BitFramePtr ? X86::SUB64ri32 : X86::SUB32ri), + tmpSizeVReg2) + .addReg(tmpSizeVReg) + .addImm(ProbeSize); + + BuildMI(blockMBB, DL, + TII->get(TFI.Uses64BitFramePtr ? X86::SUB64ri32 : X86::SUB32ri), + physSPReg) + .addReg(physSPReg) + .addImm(ProbeSize); + + const unsigned MovMIOpc = + TFI.Uses64BitFramePtr ? X86::MOV64mi32 : X86::MOV32mi; + addRegOffset(BuildMI(blockMBB, DL, TII->get(MovMIOpc)), physSPReg, false, 0) + .addImm(0); + + BuildMI(blockMBB, DL, TII->get(X86::JMP_1)).addMBB(testMBB); + blockMBB->addSuccessor(testMBB); + + // allocate the tail and continue + BuildMI(tailMBB, DL, + TII->get(TFI.Uses64BitFramePtr ? 
X86::SUB64rr : X86::SUB32rr), + physSPReg) + .addReg(physSPReg) + .addReg(tmpSizeVReg); + BuildMI(tailMBB, DL, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg()) + .addReg(physSPReg); + + tailMBB->splice(tailMBB->end(), BB, + std::next(MachineBasicBlock::iterator(MI)), BB->end()); + tailMBB->transferSuccessorsAndUpdatePHIs(BB); + BB->addSuccessor(testMBB); + + // Delete the original pseudo instruction. + MI.eraseFromParent(); + + // And we're done. + return tailMBB; +} + +MachineBasicBlock * X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI, MachineBasicBlock *BB) const { MachineFunction *MF = BB->getParent(); @@ -32276,6 +32379,9 @@ case X86::SEG_ALLOCA_32: case X86::SEG_ALLOCA_64: return EmitLoweredSegAlloca(MI, BB); + case X86::PROBED_ALLOCA_32: + case X86::PROBED_ALLOCA_64: + return EmitLoweredProbedAlloca(MI, BB); case X86::TLSCall_32: case X86::TLSCall_64: return EmitLoweredTLSCall(MI, BB); @@ -47342,10 +47448,35 @@ return Subtarget.is64Bit(); } +/// Returns true if stack probing through a function call is requested. +bool X86TargetLowering::hasStackProbeSymbol(MachineFunction &MF) const { + return !getStackProbeSymbolName(MF).empty(); +} + +/// Returns true if stack probing through inline assembly is requested. +bool X86TargetLowering::hasInlineStackProbe(MachineFunction &MF) const { + + // No inline stack probe for Windows, they have their own mechanism. + if (Subtarget.isOSWindows() || + MF.getFunction().hasFnAttribute("no-stack-arg-probe")) + return false; + + // If the function specifically requests inline stack probes, emit them. + if (MF.getFunction().hasFnAttribute("probe-stack")) + return MF.getFunction().getFnAttribute("probe-stack").getValueAsString() == + "inline-asm"; + + return false; +} + /// Returns the name of the symbol used to emit stack probes or the empty /// string if not applicable. 
StringRef X86TargetLowering::getStackProbeSymbolName(MachineFunction &MF) const { + // Inline Stack probes disable stack probe call + if (hasInlineStackProbe(MF)) + return ""; + // If the function specifically requests stack probes, emit them. if (MF.getFunction().hasFnAttribute("probe-stack")) return MF.getFunction().getFnAttribute("probe-stack").getValueAsString(); diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td --- a/llvm/lib/Target/X86/X86InstrCompiler.td +++ b/llvm/lib/Target/X86/X86InstrCompiler.td @@ -111,6 +111,23 @@ [(set GR64:$dst, (X86SegAlloca GR64:$size))]>, Requires<[In64BitMode]>; + +// To protect against stack clash, dynamic allocation should perform a memory +// probe at each page. + +let Defs = [EAX, ESP, EFLAGS], Uses = [ESP] in +def PROBED_ALLOCA_32 : I<0, Pseudo, (outs GR32:$dst), (ins GR32:$size), + "# variable sized alloca with probing", + [(set GR32:$dst, + (X86ProbedAlloca GR32:$size))]>, + Requires<[NotLP64]>; + +let Defs = [RAX, RSP, EFLAGS], Uses = [RSP] in +def PROBED_ALLOCA_64 : I<0, Pseudo, (outs GR64:$dst), (ins GR64:$size), + "# variable sized alloca with probing", + [(set GR64:$dst, + (X86ProbedAlloca GR64:$size))]>, + Requires<[In64BitMode]>; } // Dynamic stack allocation yields a _chkstk or _alloca call for all Windows diff --git a/llvm/lib/Target/X86/X86InstrInfo.td b/llvm/lib/Target/X86/X86InstrInfo.td --- a/llvm/lib/Target/X86/X86InstrInfo.td +++ b/llvm/lib/Target/X86/X86InstrInfo.td @@ -121,6 +121,8 @@ def SDT_X86SEG_ALLOCA : SDTypeProfile<1, 1, [SDTCisVT<0, iPTR>, SDTCisVT<1, iPTR>]>; +def SDT_X86PROBED_ALLOCA : SDTypeProfile<1, 1, [SDTCisVT<0, iPTR>, SDTCisVT<1, iPTR>]>; + def SDT_X86EHRET : SDTypeProfile<0, 1, [SDTCisInt<0>]>; def SDT_X86TCRET : SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisVT<1, i32>]>; @@ -294,6 +296,9 @@ def X86SegAlloca : SDNode<"X86ISD::SEG_ALLOCA", SDT_X86SEG_ALLOCA, [SDNPHasChain]>; +def X86ProbedAlloca : SDNode<"X86ISD::PROBED_ALLOCA", 
SDT_X86PROBED_ALLOCA, + [SDNPHasChain]>; + def X86TLSCall : SDNode<"X86ISD::TLSCALL", SDT_X86TLSCALL, [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; diff --git a/llvm/test/CodeGen/X86/stack-clash-dynamic-alloca.ll b/llvm/test/CodeGen/X86/stack-clash-dynamic-alloca.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/stack-clash-dynamic-alloca.ll @@ -0,0 +1,44 @@ +; RUN: llc < %s | FileCheck %s + + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define i32 @foo(i32 %n) local_unnamed_addr #0 { + +; CHECK-LABEL: foo: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset %rbp, -16 +; CHECK-NEXT: movq %rsp, %rbp +; CHECK-NEXT: .cfi_def_cfa_register %rbp +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: leaq 15(,%rax,4), %rax +; CHECK-NEXT: andq $-16, %rax +; CHECK-NEXT: cmpq $4096, %rax # imm = 0x1000 +; CHECK-NEXT: jl .LBB0_3 +; CHECK-NEXT: .LBB0_2: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: subq $4096, %rax # imm = 0x1000 +; CHECK-NEXT: subq $4096, %rsp # imm = 0x1000 +; CHECK-NEXT: movq $0, (%rsp) +; CHECK-NEXT: cmpq $4096, %rax # imm = 0x1000 +; CHECK-NEXT: jge .LBB0_2 +; CHECK-NEXT: .LBB0_3: +; CHECK-NEXT: subq %rax, %rsp +; CHECK-NEXT: movq %rsp, %rax +; CHECK-NEXT: movl $1, 4792(%rax) +; CHECK-NEXT: movl (%rax), %eax +; CHECK-NEXT: movq %rbp, %rsp +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: .cfi_def_cfa %rsp, 8 +; CHECK-NEXT: retq + + %a = alloca i32, i32 %n, align 16 + %b = getelementptr inbounds i32, i32* %a, i64 1198 + store volatile i32 1, i32* %b + %c = load volatile i32, i32* %a + ret i32 %c +} + +attributes #0 = {"probe-stack"="inline-asm"} diff --git a/llvm/test/CodeGen/X86/stack-clash-large.ll b/llvm/test/CodeGen/X86/stack-clash-large.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/stack-clash-large.ll @@ -0,0 +1,38 @@ +; RUN: llc < %s | FileCheck %s + + +target datalayout = 
"e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define i32 @foo() local_unnamed_addr #0 { + +; CHECK-LABEL: foo: +; CHECK: # %bb.0: +; CHECK-NEXT: movq %rsp, %r11 +; CHECK-NEXT: subq $69632, %r11 # imm = 0x11000 +; CHECK-NEXT: .LBB0_1: +; CHECK-NEXT: subq $4096, %rsp # imm = 0x1000 +; CHECK-NEXT: movq $0, (%rsp) +; CHECK-NEXT: cmpq %r11, %rsp +; CHECK-NEXT: jne .LBB0_1 +; CHECK-NEXT:# %bb.2: +; CHECK-NEXT: subq $2248, %rsp # imm = 0x8C8 +; CHECK-NEXT: .cfi_def_cfa_offset 71888 +; CHECK-NEXT: movl $1, 264(%rsp) +; CHECK-NEXT: movl $1, 28664(%rsp) +; CHECK-NEXT: movl -128(%rsp), %eax +; CHECK-NEXT: addq $71880, %rsp # imm = 0x118C8 +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq + + + %a = alloca i32, i64 18000, align 16 + %b0 = getelementptr inbounds i32, i32* %a, i64 98 + %b1 = getelementptr inbounds i32, i32* %a, i64 7198 + store volatile i32 1, i32* %b0 + store volatile i32 1, i32* %b1 + %c = load volatile i32, i32* %a + ret i32 %c +} + +attributes #0 = {"probe-stack"="inline-asm"} diff --git a/llvm/test/CodeGen/X86/stack-clash-medium-natural-probes-mutliple-objects.ll b/llvm/test/CodeGen/X86/stack-clash-medium-natural-probes-mutliple-objects.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/stack-clash-medium-natural-probes-mutliple-objects.ll @@ -0,0 +1,32 @@ +; RUN: llc < %s | FileCheck %s + + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define i32 @foo() local_unnamed_addr #0 { + +; CHECK-LABEL: foo: +; CHECK: # %bb.0: +; CHECK-NEXT subq $4096, %rsp # imm = 0x1000 +; CHECK-NEXT .cfi_def_cfa_offset 5888 +; CHECK-NEXT movl $1, 2088(%rsp) +; CHECK-NEXT subq $1784, %rsp # imm = 0x6F8 +; CHECK-NEXT movl $2, 672(%rsp) +; CHECK-NEXT movl 1872(%rsp), %eax +; CHECK-NEXT addq $5880, %rsp # imm = 0x16F8 +; CHECK-NEXT .cfi_def_cfa_offset 8 +; CHECK-NEXT retq + + + %a = alloca i32, i64 1000, align 16 + %b = alloca i32, i64 500, align 16 + %a0 = 
getelementptr inbounds i32, i32* %a, i64 500 + %b0 = getelementptr inbounds i32, i32* %b, i64 200 + store volatile i32 1, i32* %a0 + store volatile i32 2, i32* %b0 + %c = load volatile i32, i32* %a + ret i32 %c +} + +attributes #0 = {"probe-stack"="inline-asm"} diff --git a/llvm/test/CodeGen/X86/stack-clash-medium-natural-probes.ll b/llvm/test/CodeGen/X86/stack-clash-medium-natural-probes.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/stack-clash-medium-natural-probes.ll @@ -0,0 +1,33 @@ +; RUN: llc < %s | FileCheck %s + + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define i32 @foo() local_unnamed_addr #0 { + +; CHECK-LABEL: foo: +; CHECK: # %bb.0: +; CHECK-NEXT: subq $4096, %rsp # imm = 0x1000 +; CHECK-NEXT: movq $0, (%rsp) +; CHECK-NEXT: subq $3784, %rsp # imm = 0xEC8 +; CHECK-NEXT: .cfi_def_cfa_offset 7888 +; CHECK-NEXT: movl $1, 264(%rsp) +; CHECK-NEXT: movl $1, 4664(%rsp) +; CHECK-NEXT: movl -128(%rsp), %eax +; CHECK-NEXT: addq $7880, %rsp # imm = 0x1EC8 +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq + + + + %a = alloca i32, i64 2000, align 16 + %b0 = getelementptr inbounds i32, i32* %a, i64 98 + %b1 = getelementptr inbounds i32, i32* %a, i64 1198 + store i32 1, i32* %b0 + store i32 1, i32* %b1 + %c = load volatile i32, i32* %a + ret i32 %c +} + +attributes #0 = {"probe-stack"="inline-asm"} diff --git a/llvm/test/CodeGen/X86/stack-clash-medium.ll b/llvm/test/CodeGen/X86/stack-clash-medium.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/stack-clash-medium.ll @@ -0,0 +1,30 @@ +; RUN: llc < %s | FileCheck %s + + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define i32 @foo() local_unnamed_addr #0 { + +; CHECK-LABEL: foo: +; CHECK: # %bb.0: +; CHECK-NEXT: subq $4096, %rsp # imm = 0x1000 +; CHECK-NEXT: movq $0, (%rsp) +; CHECK-NEXT: subq $3784, %rsp # imm = 0xEC8 +; CHECK-NEXT: 
.cfi_def_cfa_offset 7888 +; CHECK-NEXT: movl $1, 672(%rsp) +; CHECK-NEXT: movl -128(%rsp), %eax +; CHECK-NEXT: addq $7880, %rsp # imm = 0x1EC8 +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq + + + + %a = alloca i32, i64 2000, align 16 + %b = getelementptr inbounds i32, i32* %a, i64 200 + store volatile i32 1, i32* %b + %c = load volatile i32, i32* %a + ret i32 %c +} + +attributes #0 = {"probe-stack"="inline-asm"} diff --git a/llvm/test/CodeGen/X86/stack-clash-no-free-probe.ll b/llvm/test/CodeGen/X86/stack-clash-no-free-probe.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/stack-clash-no-free-probe.ll @@ -0,0 +1,27 @@ +; RUN: llc < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define i32 @foo(i64 %i) local_unnamed_addr #0 { +; CHECK-LABEL: foo: +; CHECK: # %bb.0: +; CHECK-NEXT: subq $4096, %rsp # imm = 0x1000 +; CHECK-NEXT: movq $0, (%rsp) +; CHECK-NEXT: subq $3784, %rsp # imm = 0xEC8 +; CHECK-NEXT: .cfi_def_cfa_offset 7888 +; CHECK-NEXT: movl $1, -128(%rsp,%rdi,4) +; CHECK-NEXT: movl -128(%rsp), %eax +; CHECK-NEXT: addq $7880, %rsp # imm = 0x1EC8 +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq + + %a = alloca i32, i32 2000, align 16 + %b = getelementptr inbounds i32, i32* %a, i64 %i + store volatile i32 1, i32* %b + %c = load volatile i32, i32* %a + ret i32 %c +} + +attributes #0 = {"probe-stack"="inline-asm"} + diff --git a/llvm/test/CodeGen/X86/stack-clash-small.ll b/llvm/test/CodeGen/X86/stack-clash-small.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/stack-clash-small.ll @@ -0,0 +1,25 @@ +; RUN: llc < %s | FileCheck %s + + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define i32 @foo() local_unnamed_addr #0 { +; CHECK-LABEL: foo: +; CHECK: # %bb.0: +; CHECK-NEXT: subq $280, %rsp # imm = 0x118 +; CHECK-NEXT: .cfi_def_cfa_offset 288 +; CHECK-NEXT: movl $1, 264(%rsp) 
+; CHECK-NEXT: movl -128(%rsp), %eax +; CHECK-NEXT: addq $280, %rsp # imm = 0x118 +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq + + %a = alloca i32, i64 100, align 16 + %b = getelementptr inbounds i32, i32* %a, i64 98 + store volatile i32 1, i32* %b + %c = load volatile i32, i32* %a + ret i32 %c +} + +attributes #0 = {"probe-stack"="inline-asm"} diff --git a/llvm/test/CodeGen/X86/stack-clash-unknown-call.ll b/llvm/test/CodeGen/X86/stack-clash-unknown-call.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/stack-clash-unknown-call.ll @@ -0,0 +1,31 @@ +; RUN: llc < %s | FileCheck %s + + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1 immarg); + +define void @foo() local_unnamed_addr #0 { + +;CHECK-LABEL: foo: +;CHECK: # %bb.0: +;CHECK-NEXT: subq $4096, %rsp # imm = 0x1000 +; it's important that we don't use the call as a probe here +;CHECK-NEXT: movq $0, (%rsp) +;CHECK-NEXT: subq $3912, %rsp # imm = 0xF48 +;CHECK-NEXT: .cfi_def_cfa_offset 8016 +;CHECK-NEXT: movq %rsp, %rdi +;CHECK-NEXT: movl $8000, %edx # imm = 0x1F40 +;CHECK-NEXT: xorl %esi, %esi +;CHECK-NEXT: callq memset +;CHECK-NEXT: addq $8008, %rsp # imm = 0x1F48 +;CHECK-NEXT: .cfi_def_cfa_offset 8 +;CHECK-NEXT: retq + + %a = alloca i8, i64 8000, align 16 + call void @llvm.memset.p0i8.i64(i8* align 16 %a, i8 0, i64 8000, i1 false) + ret void +} + +attributes #0 = {"probe-stack"="inline-asm"}