Index: include/llvm/CodeGen/CommandFlags.def =================================================================== --- include/llvm/CodeGen/CommandFlags.def +++ include/llvm/CodeGen/CommandFlags.def @@ -259,6 +259,11 @@ "stack-size-section", cl::desc("Emit a section containing stack size metadata"), cl::init(false)); +static cl::opt<bool> + EnableStackProbe("stack-probe", + cl::desc("Enable stack probing for non-Windows targets"), + cl::init(false)); + // Common utility function tightly tied to the options listed here. Initializes // a TargetOptions object with CodeGen flags and returns it. static TargetOptions InitTargetOptionsFromCodeGenFlags() { @@ -286,6 +291,7 @@ Options.EmulatedTLS = EmulatedTLS; Options.ExceptionModel = ExceptionModel; Options.EmitStackSizeSection = EnableStackSizeSection; + Options.EnableStackProbe = EnableStackProbe; Options.MCOptions = InitMCTargetOptionsFromFlags(); Index: include/llvm/Target/TargetOptions.h =================================================================== --- include/llvm/Target/TargetOptions.h +++ include/llvm/Target/TargetOptions.h @@ -108,7 +108,7 @@ DisableIntegratedAS(false), RelaxELFRelocations(false), FunctionSections(false), DataSections(false), UniqueSectionNames(true), TrapUnreachable(false), EmulatedTLS(false), - EnableIPRA(false), EmitStackSizeSection(false) {} + EnableIPRA(false), EmitStackSizeSection(false), EnableStackProbe(false) {} /// PrintMachineCode - This flag is enabled when the -print-machineinstrs /// option is specified on the command line, and should enable debugging @@ -219,6 +219,9 @@ /// Emit section containing metadata on function stack sizes. unsigned EmitStackSizeSection : 1; + /// Enable stack probing for non-Windows targets (if supported). + unsigned EnableStackProbe : 1; + /// FloatABIType - This setting is set by -float-abi=xxx option is specfied /// on the command line. This setting may either be Default, Soft, or Hard. /// Default selects the target's default behavior. 
Soft selects the ABI for Index: lib/Target/X86/CMakeLists.txt =================================================================== --- lib/Target/X86/CMakeLists.txt +++ lib/Target/X86/CMakeLists.txt @@ -55,7 +55,7 @@ X86TargetObjectFile.cpp X86TargetTransformInfo.cpp X86VZeroUpper.cpp - X86WinAllocaExpander.cpp + X86DynAllocaExpander.cpp X86WinEHState.cpp X86CallingConv.cpp ) Index: lib/Target/X86/X86.h =================================================================== --- lib/Target/X86/X86.h +++ lib/Target/X86/X86.h @@ -65,8 +65,8 @@ /// Return a pass that transforms setcc + movzx pairs into xor + setcc. FunctionPass *createX86FixupSetCC(); -/// Return a pass that expands WinAlloca pseudo-instructions. -FunctionPass *createX86WinAllocaExpander(); +/// Return a pass that expands DynAlloca pseudo-instructions. +FunctionPass *createX86DynAllocaExpander(); /// Return a pass that optimizes the code-size of x86 call sequences. This is /// done by replacing esp-relative movs with pushes. Index: lib/Target/X86/X86DynAllocaExpander.cpp =================================================================== --- lib/Target/X86/X86DynAllocaExpander.cpp +++ lib/Target/X86/X86DynAllocaExpander.cpp @@ -1,4 +1,4 @@ -//===----- X86WinAllocaExpander.cpp - Expand WinAlloca pseudo instruction -===// +//===----- X86DynAllocaExpander.cpp - Expand DynAlloca pseudo instruction -===// // // The LLVM Compiler Infrastructure // @@ -7,7 +7,7 @@ // //===----------------------------------------------------------------------===// // -// This file defines a pass that expands WinAlloca pseudo-instructions. +// This file defines a pass that expands DynAlloca pseudo-instructions. 
// // It performs a conservative analysis to determine whether each allocation // falls within a region of the stack that is safe to use, or whether stack @@ -33,26 +33,26 @@ namespace { -class X86WinAllocaExpander : public MachineFunctionPass { +class X86DynAllocaExpander : public MachineFunctionPass { public: - X86WinAllocaExpander() : MachineFunctionPass(ID) {} + X86DynAllocaExpander() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &MF) override; private: - /// Strategies for lowering a WinAlloca. + /// Strategies for lowering a DynAlloca. enum Lowering { TouchAndSub, Sub, Probe }; - /// Deterministic-order map from WinAlloca instruction to desired lowering. + /// Deterministic-order map from DynAlloca instruction to desired lowering. typedef MapVector<MachineInstr*, Lowering> LoweringMap; - /// Compute which lowering to use for each WinAlloca instruction. + /// Compute which lowering to use for each DynAlloca instruction. void computeLowerings(MachineFunction &MF, LoweringMap& Lowerings); /// Get the appropriate lowering based on current offset and amount. Lowering getLowering(int64_t CurrentOffset, int64_t AllocaAmount); - /// Lower a WinAlloca instruction. + /// Lower a DynAlloca instruction. void lower(MachineInstr* MI, Lowering L); MachineRegisterInfo *MRI; @@ -63,22 +63,22 @@ unsigned SlotSize; int64_t StackProbeSize; - StringRef getPassName() const override { return "X86 WinAlloca Expander"; } + StringRef getPassName() const override { return "X86 DynAlloca Expander"; } static char ID; }; -char X86WinAllocaExpander::ID = 0; +char X86DynAllocaExpander::ID = 0; } // end anonymous namespace -FunctionPass *llvm::createX86WinAllocaExpander() { - return new X86WinAllocaExpander(); +FunctionPass *llvm::createX86DynAllocaExpander() { + return new X86DynAllocaExpander(); } -/// Return the allocation amount for a WinAlloca instruction, or -1 if unknown. 
-static int64_t getWinAllocaAmount(MachineInstr *MI, MachineRegisterInfo *MRI) { - assert(MI->getOpcode() == X86::WIN_ALLOCA_32 || - MI->getOpcode() == X86::WIN_ALLOCA_64); +/// Return the allocation amount for a DynAlloca instruction, or -1 if unknown. +static int64_t getDynAllocaAmount(MachineInstr *MI, MachineRegisterInfo *MRI) { + assert(MI->getOpcode() == X86::DYN_ALLOCA_32 || + MI->getOpcode() == X86::DYN_ALLOCA_64); assert(MI->getOperand(0).isReg()); unsigned AmountReg = MI->getOperand(0).getReg(); @@ -96,8 +96,8 @@ return Def->getOperand(1).getImm(); } -X86WinAllocaExpander::Lowering -X86WinAllocaExpander::getLowering(int64_t CurrentOffset, +X86DynAllocaExpander::Lowering +X86DynAllocaExpander::getLowering(int64_t CurrentOffset, int64_t AllocaAmount) { // For a non-constant amount or a large amount, we have to probe. if (AllocaAmount < 0 || AllocaAmount > StackProbeSize) @@ -131,11 +131,11 @@ } } -void X86WinAllocaExpander::computeLowerings(MachineFunction &MF, +void X86DynAllocaExpander::computeLowerings(MachineFunction &MF, LoweringMap &Lowerings) { // Do a one-pass reverse post-order walk of the CFG to conservatively estimate // the offset between the stack pointer and the lowest touched part of the - // stack, and use that to decide how to lower each WinAlloca instruction. + // stack, and use that to decide how to lower each DynAlloca instruction. // Initialize OutOffset[B], the stack offset at exit from B, to something big. DenseMap<MachineBasicBlock *, int64_t> OutOffset; @@ -156,10 +156,10 @@ if (Offset == -1) Offset = INT32_MAX; for (MachineInstr &MI : *MBB) { - if (MI.getOpcode() == X86::WIN_ALLOCA_32 || - MI.getOpcode() == X86::WIN_ALLOCA_64) { - // A WinAlloca moves StackPtr, and potentially touches it. - int64_t Amount = getWinAllocaAmount(&MI, MRI); + if (MI.getOpcode() == X86::DYN_ALLOCA_32 || + MI.getOpcode() == X86::DYN_ALLOCA_64) { + // A DynAlloca moves StackPtr, and potentially touches it. 
+ int64_t Amount = getDynAllocaAmount(&MI, MRI); Lowering L = getLowering(Offset, Amount); Lowerings[&MI] = L; switch (L) { @@ -198,12 +198,12 @@ return isInt<8>(Amount) ? X86::SUB32ri8 : X86::SUB32ri; } -void X86WinAllocaExpander::lower(MachineInstr* MI, Lowering L) { +void X86DynAllocaExpander::lower(MachineInstr* MI, Lowering L) { DebugLoc DL = MI->getDebugLoc(); MachineBasicBlock *MBB = MI->getParent(); MachineBasicBlock::iterator I = *MI; - int64_t Amount = getWinAllocaAmount(MI, MRI); + int64_t Amount = getDynAllocaAmount(MI, MRI); if (Amount == 0) { MI->eraseFromParent(); return; @@ -267,8 +267,8 @@ } } -bool X86WinAllocaExpander::runOnMachineFunction(MachineFunction &MF) { - if (!MF.getInfo<X86MachineFunctionInfo>()->hasWinAlloca()) +bool X86DynAllocaExpander::runOnMachineFunction(MachineFunction &MF) { + if (!MF.getInfo<X86MachineFunctionInfo>()->hasDynAlloca()) return false; MRI = &MF.getRegInfo(); Index: lib/Target/X86/X86FrameLowering.cpp =================================================================== --- lib/Target/X86/X86FrameLowering.cpp +++ lib/Target/X86/X86FrameLowering.cpp @@ -779,8 +779,9 @@ // MSVC x64's __chkstk and cygwin/mingw's ___chkstk_ms do not adjust %rsp // themselves. They also does not clobber %rax so we can reuse it when // adjusting %rsp. - // All other platforms do not specify a particular ABI for the stack probe - // function, so we arbitrarily define it to not adjust %esp/%rsp itself. + // All other platforms including Darwin do not specify a particular ABI for + // the stack probe function, so we arbitrarily define it to not adjust + // %esp/%rsp itself. BuildMI(MBB, MBBI, DL, TII.get(getSUBrrOpcode(Is64Bit)), SP) .addReg(SP) .addReg(AX); @@ -982,7 +983,8 @@ // The default stack probe size is 4096 if the function has no stackprobesize // attribute. unsigned StackProbeSize = 4096; - if (Fn->hasFnAttribute("stack-probe-size")) + // On Darwin the probe size is fixed so don't support overriding it. 
+ if (!STI.isTargetDarwin() && Fn->hasFnAttribute("stack-probe-size")) Fn->getFnAttribute("stack-probe-size") .getValueAsString() .getAsInteger(0, StackProbeSize); @@ -997,6 +999,14 @@ emitSPUpdate(MBB, MBBI, -8, /*InEpilogue=*/false); } + // On Darwin we could potentially need a stack probe if the stack size is over + // page size or if the function can potentially use a dynamic alloca. + // Otherwise we can retain the red-zone optimization. + bool AnyStackProbeNeeded = UseStackProbe; + if (STI.isTargetDarwin()) + AnyStackProbeNeeded &= + (StackSize > StackProbeSize || X86FI->hasDynAlloca()); + // If this is x86-64 and the Red Zone is not disabled, if we are a leaf // function, and use up to 128 bytes of stack space, don't have a frame // pointer, calls, or dynamic alloca then we do not need to adjust the @@ -1006,7 +1016,7 @@ !TRI->needsStackRealignment(MF) && !MFI.hasVarSizedObjects() && // No dynamic alloca. !MFI.adjustsStack() && // No calls. - !UseStackProbe && // No stack probes. + !AnyStackProbeNeeded && // No stack probes. !IsWin64CC && // Win64 has no Red Zone !MFI.hasCopyImplyingStackAdjustment() && // Don't push and pop. !MF.shouldSplitStack()) { // Regular stack @@ -1209,14 +1219,22 @@ bool isEAXAlive = isEAXLiveIn(MBB); if (isEAXAlive) { - // Sanity check that EAX is not livein for this function. - // It should not be, so throw an assert. - assert(!Is64Bit && "EAX is livein in x64 case!"); + if (!STI.isTargetDarwin()) { + // For the Win64 ABI EAX should not be livein for this function. 
+ assert(!Is64Bit && "EAX is livein in Win64 case!"); - // Save EAX - BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH32r)) - .addReg(X86::EAX, RegState::Kill) - .setMIFlag(MachineInstr::FrameSetup); + // Save EAX + BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH32r)) + .addReg(X86::EAX, RegState::Kill) + .setMIFlag(MachineInstr::FrameSetup); + } else { + assert(Is64Bit && "Must be 64 bit Darwin"); + + // Save RAX + BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH64r)) + .addReg(X86::RAX, RegState::Kill) + .setMIFlag(MachineInstr::FrameSetup); + } } if (Is64Bit) { @@ -1243,14 +1261,20 @@ .setMIFlag(MachineInstr::FrameSetup); } - // Call __chkstk, __chkstk_ms, or __alloca. + // Call the platform-specific probing function. emitStackProbe(MF, MBB, MBBI, DL, true); if (isEAXAlive) { - // Restore EAX - MachineInstr *MI = - addRegOffset(BuildMI(MF, DL, TII.get(X86::MOV32rm), X86::EAX), - StackPtr, false, NumBytes - 4); + MachineInstr *MI; + if (!STI.isTargetDarwin()) { + // Restore EAX + MI = addRegOffset(BuildMI(MF, DL, TII.get(X86::MOV32rm), X86::EAX), + StackPtr, false, NumBytes - 4); + } else { + // Restore RAX. NOTE(review): confirm NumBytes - 8 is RAX's slot. + MI = addRegOffset(BuildMI(MF, DL, TII.get(X86::MOV64rm), X86::RAX), + StackPtr, false, NumBytes - 8); + } MI->setFlag(MachineInstr::FrameSetup); MBB.insert(MBBI, MI); } Index: lib/Target/X86/X86ISelLowering.h =================================================================== --- lib/Target/X86/X86ISelLowering.h +++ lib/Target/X86/X86ISelLowering.h @@ -549,8 +549,8 @@ // is needed so that this can be expanded with control flow. VASTART_SAVE_XMM_REGS, - // Windows's _chkstk call to do stack probing. - WIN_ALLOCA, + // Stack checking call to do stack probing on Windows and Darwin. + DYN_ALLOCA, // For allocating variable amounts of stack space when using // segmented stacks. 
Check if the current stacklet has enough space, and Index: lib/Target/X86/X86ISelLowering.cpp =================================================================== --- lib/Target/X86/X86ISelLowering.cpp +++ lib/Target/X86/X86ISelLowering.cpp @@ -19196,8 +19196,8 @@ DAG.getRegister(Vreg, SPTy)); } else { SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); - Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Size); - MF.getInfo<X86MachineFunctionInfo>()->setHasWinAlloca(true); + Chain = DAG.getNode(X86ISD::DYN_ALLOCA, dl, NodeTys, Chain, Size); + MF.getInfo<X86MachineFunctionInfo>()->setHasDynAlloca(true); const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); unsigned SPReg = RegInfo->getStackRegister(); @@ -25156,7 +25156,7 @@ case X86ISD::DBPSADBW: return "X86ISD::DBPSADBW"; case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS"; case X86ISD::VAARG_64: return "X86ISD::VAARG_64"; - case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA"; + case X86ISD::DYN_ALLOCA: return "X86ISD::DYN_ALLOCA"; case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER"; case X86ISD::MFENCE: return "X86ISD::MFENCE"; case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA"; @@ -38246,8 +38246,11 @@ if (MF.getFunction()->hasFnAttribute("probe-stack")) return MF.getFunction()->getFnAttribute("probe-stack").getValueAsString(); - // Generally, if we aren't on Windows, the platform ABI does not include - // support for stack probes, so don't emit them. + if (Subtarget.isTargetDarwin() && Subtarget.is64Bit() && + MF.getTarget().Options.EnableStackProbe) + return "___chkstk_darwin"; + + // Disable probing for EFI targets that are non-Darwin but use MachO. 
if (!Subtarget.isOSWindows() || Subtarget.isTargetMachO()) return ""; Index: lib/Target/X86/X86InstrCompiler.td =================================================================== --- lib/Target/X86/X86InstrCompiler.td +++ lib/Target/X86/X86InstrCompiler.td @@ -122,24 +122,27 @@ Requires<[In64BitMode]>; } -// Dynamic stack allocation yields a _chkstk or _alloca call for all Windows -// targets. These calls are needed to probe the stack when allocating more than -// 4k bytes in one go. Touching the stack at 4K increments is necessary to +// Dynamic stack allocation yields a _chkstk or _alloca call for all Windows +// targets, and a ___chkstk_darwin call on 64-bit Darwin under -stack-probe. +// On Windows, these calls are needed to probe the stack when allocating more +// than 4k bytes in one go. Touching the stack at 4K increments is necessary to // ensure that the guard pages used by the OS virtual memory manager are // allocated in correct sequence. +// On Darwin, the calls are used to ensure that the stack guard page is hit for +// dynamic allocas. // The main point of having separate instruction are extra unmodelled effects // (compared to ordinary calls) like stack pointer change. 
let Defs = [EAX, ESP, EFLAGS], Uses = [ESP] in -def WIN_ALLOCA_32 : I<0, Pseudo, (outs), (ins GR32:$size), +def DYN_ALLOCA_32 : I<0, Pseudo, (outs), (ins GR32:$size), "# dynamic stack allocation", - [(X86WinAlloca GR32:$size)]>, + [(X86DynAlloca GR32:$size)]>, Requires<[NotLP64]>; let Defs = [RAX, RSP, EFLAGS], Uses = [RSP] in -def WIN_ALLOCA_64 : I<0, Pseudo, (outs), (ins GR64:$size), +def DYN_ALLOCA_64 : I<0, Pseudo, (outs), (ins GR64:$size), "# dynamic stack allocation", - [(X86WinAlloca GR64:$size)]>, + [(X86DynAlloca GR64:$size)]>, Requires<[In64BitMode]>; Index: lib/Target/X86/X86InstrInfo.td =================================================================== --- lib/Target/X86/X86InstrInfo.td +++ lib/Target/X86/X86InstrInfo.td @@ -116,7 +116,7 @@ def SDT_X86TLSCALL : SDTypeProfile<0, 1, [SDTCisInt<0>]>; -def SDT_X86WIN_ALLOCA : SDTypeProfile<0, 1, [SDTCisVT<0, iPTR>]>; +def SDT_X86DYN_ALLOCA : SDTypeProfile<0, 1, [SDTCisVT<0, iPTR>]>; def SDT_X86SEG_ALLOCA : SDTypeProfile<1, 1, [SDTCisVT<0, iPTR>, SDTCisVT<1, iPTR>]>; @@ -283,7 +283,7 @@ def X86mul_imm : SDNode<"X86ISD::MUL_IMM", SDTIntBinOp>; -def X86WinAlloca : SDNode<"X86ISD::WIN_ALLOCA", SDT_X86WIN_ALLOCA, +def X86DynAlloca : SDNode<"X86ISD::DYN_ALLOCA", SDT_X86DYN_ALLOCA, [SDNPHasChain, SDNPOutGlue]>; def X86SegAlloca : SDNode<"X86ISD::SEG_ALLOCA", SDT_X86SEG_ALLOCA, Index: lib/Target/X86/X86MachineFunctionInfo.h =================================================================== --- lib/Target/X86/X86MachineFunctionInfo.h +++ lib/Target/X86/X86MachineFunctionInfo.h @@ -98,8 +98,8 @@ /// True if this function uses the red zone. bool UsesRedZone = false; - /// True if this function has WIN_ALLOCA instructions. - bool HasWinAlloca = false; + /// True if this function has DYN_ALLOCA instructions. 
+ bool HasDynAlloca = false; private: /// ForwardedMustTailRegParms - A list of virtual and physical registers @@ -176,8 +176,8 @@ bool getUsesRedZone() const { return UsesRedZone; } void setUsesRedZone(bool V) { UsesRedZone = V; } - bool hasWinAlloca() const { return HasWinAlloca; } - void setHasWinAlloca(bool v) { HasWinAlloca = v; } + bool hasDynAlloca() const { return HasDynAlloca; } + void setHasDynAlloca(bool v) { HasDynAlloca = v; } }; } // End llvm namespace Index: lib/Target/X86/X86TargetMachine.cpp =================================================================== --- lib/Target/X86/X86TargetMachine.cpp +++ lib/Target/X86/X86TargetMachine.cpp @@ -410,7 +410,7 @@ addPass(createX86CallFrameOptimization()); } - addPass(createX86WinAllocaExpander()); + addPass(createX86DynAllocaExpander()); } void X86PassConfig::addMachineSSAOptimization() { addPass(createX86DomainReassignmentPass()); Index: test/CodeGen/X86/O0-pipeline.ll =================================================================== --- test/CodeGen/X86/O0-pipeline.ll +++ test/CodeGen/X86/O0-pipeline.ll @@ -36,7 +36,7 @@ ; CHECK-NEXT: X86 PIC Global Base Reg Initialization ; CHECK-NEXT: Expand ISel Pseudo-instructions ; CHECK-NEXT: Local Stack Slot Allocation -; CHECK-NEXT: X86 WinAlloca Expander +; CHECK-NEXT: X86 DynAlloca Expander ; CHECK-NEXT: Eliminate PHI nodes for register allocation ; CHECK-NEXT: Two-Address instruction pass ; CHECK-NEXT: Fast Register Allocator Index: test/CodeGen/X86/darwin-stack-probing.ll =================================================================== --- /dev/null +++ test/CodeGen/X86/darwin-stack-probing.ll @@ -0,0 +1,53 @@ +; RUN: llc < %s -mtriple=x86_64-apple-darwin -verify-machineinstrs -stack-probe | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-apple-darwin -verify-machineinstrs | FileCheck --check-prefix=DISABLED %s +target triple = "x86_64-apple-darwin" + +declare i32 @use_ptr(i32*) + +; Expect a probe here due to static object size > 4096. 
+; Function Attrs: noinline nounwind optnone uwtable +define void @static_test1_probe() { +; CHECK-LABEL: static_test1_probe: +; CHECK: movl $16392, %eax +; CHECK-NEXT: callq ____chkstk_darwin +; CHECK: subq %rax, %rsp +; +; DISABLED-LABEL: static_test1_probe: +; DISABLED-NOT: callq ____chkstk_darwin + %1 = alloca [4096 x i32], align 4 + %2 = getelementptr inbounds [4096 x i32], [4096 x i32]* %1, i32 0, i32 0 + %3 = call i32 @use_ptr(i32* %2) + ret void +} + +; Stack size should be less than 4k, no probe. +; Function Attrs: noinline nounwind optnone uwtable +define void @static_test2_small() { +; CHECK-LABEL: static_test2_small: +; CHECK-NOT: callq ____chkstk_darwin + %1 = alloca [64 x i32], align 4 + %2 = getelementptr inbounds [64 x i32], [64 x i32]* %1, i32 0, i32 0 + %3 = call i32 @use_ptr(i32* %2) + ret void +} + +@g = common local_unnamed_addr global i32* null, align 8 + +; Test dynamic sized allocas. +; Function Attrs: nounwind optsize ssp uwtable +define void @test_dynamic(i32* nocapture readnone, i64 %num) local_unnamed_addr { +; CHECK-LABEL: test_dynamic: +; CHECK: pushq %rbp +; CHECK: movq %rsp, %rbp +; CHECK: leaq 15(%rsi), %rax +; CHECK: andq $-16, %rax +; CHECK: callq ____chkstk_darwin +; CHECK: subq %rax, %rsp +; CHECK: movq %rsp, %rax + %2 = alloca i8, i64 %num, align 16 + %3 = bitcast i8* %2 to i32* + store i32* %3, i32** @g, align 8 + ret void +} + +