Index: include/llvm/CodeGen/MachineFunction.h =================================================================== --- include/llvm/CodeGen/MachineFunction.h +++ include/llvm/CodeGen/MachineFunction.h @@ -265,6 +265,9 @@ /// Should we be emitting segmented stack stuff for the function bool shouldSplitStack(); + /// Should we be probing the stack for the function + bool shouldProbeStack(); + /// getNumBlockIDs - Return the number of MBB ID's allocated. /// unsigned getNumBlockIDs() const { return (unsigned)MBBNumbering.size(); } Index: lib/CodeGen/MachineFunction.cpp =================================================================== --- lib/CodeGen/MachineFunction.cpp +++ lib/CodeGen/MachineFunction.cpp @@ -134,6 +134,11 @@ return getFunction()->hasFnAttribute("split-stack"); } +/// Should we be probing the stack for the function +bool MachineFunction::shouldProbeStack() { + return getFunction()->hasFnAttribute("probe-stack"); +} + /// RenumberBlocks - This discards all of the MachineBasicBlock numbers and /// recomputes them. This guarantees that the MBB numbers are sequential, /// dense, and match the ordering of the blocks within the function. If a Index: lib/Target/ARM/ARMFrameLowering.cpp =================================================================== --- lib/Target/ARM/ARMFrameLowering.cpp +++ lib/Target/ARM/ARMFrameLowering.cpp @@ -299,7 +299,8 @@ } else NumBytes = DPRCSOffset; - if (STI.isTargetWindows() && WindowsRequiresStackProbe(MF, NumBytes)) { + if ((STI.isTargetWindows() || MF.shouldProbeStack()) + && WindowsRequiresStackProbe(MF, NumBytes)) { uint32_t NumWords = NumBytes >> 2; if (NumWords < 65536) @@ -311,6 +312,9 @@ .addImm(NumWords) .setMIFlags(MachineInstr::FrameSetup); + const char *StackProbeSymbol = STI.isTargetWindows() ? 
+ "__chkstk" : "__probestack"; + switch (TM.getCodeModel()) { case CodeModel::Small: case CodeModel::Medium: @@ -318,14 +322,14 @@ case CodeModel::Kernel: BuildMI(MBB, MBBI, dl, TII.get(ARM::tBL)) .addImm((unsigned)ARMCC::AL).addReg(0) - .addExternalSymbol("__chkstk") + .addExternalSymbol(StackProbeSymbol) .addReg(ARM::R4, RegState::Implicit) .setMIFlags(MachineInstr::FrameSetup); break; case CodeModel::Large: case CodeModel::JITDefault: BuildMI(MBB, MBBI, dl, TII.get(ARM::t2MOVi32imm), ARM::R12) - .addExternalSymbol("__chkstk") + .addExternalSymbol(StackProbeSymbol) .setMIFlags(MachineInstr::FrameSetup); BuildMI(MBB, MBBI, dl, TII.get(ARM::tBLXr)) Index: lib/Target/X86/X86FrameLowering.cpp =================================================================== --- lib/Target/X86/X86FrameLowering.cpp +++ lib/Target/X86/X86FrameLowering.cpp @@ -482,6 +482,10 @@ X86FI->setCalleeSavedFrameSize( X86FI->getCalleeSavedFrameSize() - TailCallReturnAddrDelta); + bool UseRedZone = false; + bool UseStackProbe = (STI.isOSWindows() && !STI.isTargetMacho()) || + MF.shouldProbeStack(); + // If this is x86-64 and the Red Zone is not disabled, if we are a leaf // function, and use up to 128 bytes of stack space, don't have a frame // pointer, calls, or dynamic alloca then we do not need to adjust the @@ -493,12 +497,14 @@ !MFI->hasVarSizedObjects() && // No dynamic alloca. !MFI->adjustsStack() && // No calls. !IsWin64 && // Win64 has no Red Zone + !(UseStackProbe && StackSize > 128) && // No stack probes !usesTheStack(MF) && // Don't push and pop. !MF.shouldSplitStack()) { // Regular stack uint64_t MinSize = X86FI->getCalleeSavedFrameSize(); if (HasFP) MinSize += SlotSize; StackSize = std::max(MinSize, StackSize > 128 ? StackSize - 128 : 0); MFI->setStackSize(StackSize); + UseRedZone = true; } // Insert stack pointer adjustment for later moving of return addr. Only @@ -663,71 +669,94 @@ // responsible for adjusting the stack pointer. 
Touching the stack at 4K // increments is necessary to ensure that the guard pages used by the OS // virtual memory manager are allocated in correct sequence. - if (NumBytes >= 4096 && STI.isOSWindows() && !STI.isTargetMacho()) { - const char *StackProbeSymbol; + if (NumBytes >= 4096 && UseStackProbe) { + assert(!UseRedZone && "The Red Zone is not accounted for in stack probes"); - if (Is64Bit) { - if (STI.isTargetCygMing()) { - StackProbeSymbol = "___chkstk_ms"; + if (NumBytes <= 0x5000) { + BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::SUB64ri32 : X86::SUB32ri), + StackPtr) + .addReg(StackPtr) + .addImm(NumBytes) + .setMIFlag(MachineInstr::FrameSetup); + + for (uint64_t i = 0; i < NumBytes / 0x1000; ++i) { + BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::OR64mi8 : X86::OR32mi8)) + .addReg(StackPtr) + .addImm(1) + .addReg(0) + .addImm(NumBytes - (i + 1) * 0x1000) + .addReg(0) + .addImm(0) + .setMIFlag(MachineInstr::FrameSetup); + } + } else { + const char *StackProbeSymbol; + + if (STI.isOSWindows()) { + if (Is64Bit) { + if (STI.isTargetCygMing()) { + StackProbeSymbol = "___chkstk_ms"; + } else { + StackProbeSymbol = "__chkstk"; + } + } else if (STI.isTargetCygMing()) + StackProbeSymbol = "_alloca"; + else + StackProbeSymbol = "_chkstk"; } else { - StackProbeSymbol = "__chkstk"; + StackProbeSymbol = "__probestack"; } - } else if (STI.isTargetCygMing()) - StackProbeSymbol = "_alloca"; - else - StackProbeSymbol = "_chkstk"; - // Check whether EAX is livein for this function. - bool isEAXAlive = isEAXLiveIn(MF); + // Check whether the accumulator register is livein for this function. + bool isRegAccAlive = isEAXLiveIn(MF); + auto RegAcc = Is64Bit ? X86::RAX : X86::EAX; - if (isEAXAlive) { - // Sanity check that EAX is not livein for this function. - // It should not be, so throw an assert. - assert(!Is64Bit && "EAX is livein in x64 case!"); + if (isRegAccAlive) { + // Save RegAcc + BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? 
X86::PUSH64r : X86::PUSH32r)) + .addReg(RegAcc, RegState::Kill) + .setMIFlag(MachineInstr::FrameSetup); + } - // Save EAX - BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH32r)) - .addReg(X86::EAX, RegState::Kill) - .setMIFlag(MachineInstr::FrameSetup); - } + uint64_t NumBytesAdj = isRegAccAlive ? NumBytes - (Is64Bit ? 8 : 4) : + NumBytes; - if (Is64Bit) { - // Handle the 64-bit Windows ABI case where we need to call __chkstk. - // Function prologue is responsible for adjusting the stack pointer. - BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64ri), X86::RAX) - .addImm(NumBytes) + // Allocate NumBytesAdj bytes on stack in case of isRegAccAlive. + // We'll also use 8/4 already allocated bytes for RegAcc. + BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::MOV64ri : X86::MOV32ri), + RegAcc) + .addImm(NumBytesAdj) .setMIFlag(MachineInstr::FrameSetup); - } else { - // Allocate NumBytes-4 bytes on stack in case of isEAXAlive. - // We'll also use 4 already allocated bytes for EAX. - BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX) - .addImm(isEAXAlive ? NumBytes - 4 : NumBytes) - .setMIFlag(MachineInstr::FrameSetup); - } - BuildMI(MBB, MBBI, DL, - TII.get(Is64Bit ? X86::W64ALLOCA : X86::CALLpcrel32)) - .addExternalSymbol(StackProbeSymbol) - .addReg(StackPtr, RegState::Define | RegState::Implicit) - .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit) - .setMIFlag(MachineInstr::FrameSetup); - - if (Is64Bit) { - // MSVC x64's __chkstk and cygwin/mingw's ___chkstk_ms do not adjust %rsp - // themself. It also does not clobber %rax so we can reuse it when - // adjusting %rsp. - BuildMI(MBB, MBBI, DL, TII.get(X86::SUB64rr), StackPtr) - .addReg(StackPtr) - .addReg(X86::RAX) + auto CallOp = Is64Bit ? (STI.isOSWindows() ? 
X86::W64ALLOCA : + X86::CALL64pcrel32) : + X86::CALLpcrel32; + BuildMI(MBB, MBBI, DL, + TII.get(CallOp)) + .addExternalSymbol(StackProbeSymbol) + .addReg(StackPtr, RegState::Define | RegState::Implicit) + .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit) .setMIFlag(MachineInstr::FrameSetup); - } - if (isEAXAlive) { - // Restore EAX - MachineInstr *MI = addRegOffset(BuildMI(MF, DL, TII.get(X86::MOV32rm), - X86::EAX), - StackPtr, false, NumBytes - 4); - MI->setFlag(MachineInstr::FrameSetup); - MBB.insert(MBBI, MI); + + if (Is64Bit || !STI.isOSWindows()) { + // MSVC x64's __chkstk and cygwin/mingw's ___chkstk_ms do not adjust %rsp + // themselves. They also do not clobber %rax so we can reuse it when + // adjusting %rsp. + BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::SUB64rr : X86::SUB32rr), + StackPtr) + .addReg(StackPtr) + .addReg(RegAcc) + .setMIFlag(MachineInstr::FrameSetup); + } + if (isRegAccAlive) { + // Restore RegAcc + auto MIB = BuildMI(MF, DL, + TII.get(Is64Bit ? X86::MOV64rm : X86::MOV32rm), + RegAcc); + MachineInstr *MI = addRegOffset(MIB, StackPtr, false, NumBytesAdj); + MI->setFlag(MachineInstr::FrameSetup); + MBB.insert(MBBI, MI); + } } } else if (NumBytes) { emitSPUpdate(MBB, MBBI, StackPtr, -(int64_t)NumBytes, Is64Bit, IsLP64, Index: lib/Target/X86/X86ISelLowering.cpp =================================================================== --- lib/Target/X86/X86ISelLowering.cpp +++ lib/Target/X86/X86ISelLowering.cpp @@ -13682,7 +13682,7 @@ MachineFunction &MF = DAG.getMachineFunction(); bool SplitStack = MF.shouldSplitStack(); bool Lower = (Subtarget->isOSWindows() && !Subtarget->isTargetMacho()) || - SplitStack; + SplitStack || MF.shouldProbeStack(); SDLoc dl(Op); if (!Lower) { @@ -18198,7 +18198,7 @@ // The lowering is pretty easy: we're just emitting the call to _alloca. The // non-trivial part is impdef of ESP. 
- if (Subtarget->isTargetWin64()) { + if (Subtarget->isTargetWin64() || !Subtarget->isOSWindows()) { if (Subtarget->isTargetCygMing()) { // ___chkstk(Mingw64): // Clobbers R10, R11, RAX and EFLAGS. @@ -18211,16 +18211,29 @@ .addReg(X86::RSP, RegState::Define | RegState::Implicit) .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit); } else { - // __chkstk(MSVCRT): does not update stack pointer. - // Clobbers R10, R11 and EFLAGS. - BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA)) - .addExternalSymbol("__chkstk") - .addReg(X86::RAX, RegState::Implicit) - .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit); - // RAX has the offset to be subtracted from RSP. - BuildMI(*BB, MI, DL, TII->get(X86::SUB64rr), X86::RSP) - .addReg(X86::RSP) - .addReg(X86::RAX); + const char *StackProbeSymbol = + Subtarget->isOSWindows() ? "__chkstk" : "__probestack"; + if (Subtarget->is64Bit()) { + // __chkstk(MSVCRT): does not update stack pointer. + // Clobbers R10, R11 and EFLAGS. + BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA)) + .addExternalSymbol(StackProbeSymbol) + .addReg(X86::RAX, RegState::Implicit) + .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit); + // RAX has the offset to be subtracted from RSP. + BuildMI(*BB, MI, DL, TII->get(X86::SUB64rr), X86::RSP) + .addReg(X86::RSP) + .addReg(X86::RAX); + } else { + BuildMI(*BB, MI, DL, TII->get(X86::CALLpcrel32)) + .addExternalSymbol(StackProbeSymbol) + .addReg(X86::EAX, RegState::Implicit) + .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit); + // EAX has the offset to be subtracted from ESP. 
+ BuildMI(*BB, MI, DL, TII->get(X86::SUB32rr), X86::ESP) + .addReg(X86::ESP) + .addReg(X86::EAX); + } } } else { const char *StackProbeSymbol = Index: lib/Transforms/IPO/Inliner.cpp =================================================================== --- lib/Transforms/IPO/Inliner.cpp +++ lib/Transforms/IPO/Inliner.cpp @@ -137,6 +137,10 @@ AdjustCallerSSPLevel(Caller, Callee); + if (Callee->hasFnAttribute("probe-stack")) { + Caller->addFnAttr("probe-stack", ""); + } + // Look at all of the allocas that we inlined through this call site. If we // have already inlined other allocas through other calls into this function, // then we know that they have disjoint lifetimes and that we can merge them. Index: test/CodeGen/X86/mingw-alloca.ll =================================================================== --- test/CodeGen/X86/mingw-alloca.ll +++ test/CodeGen/X86/mingw-alloca.ll @@ -22,14 +22,14 @@ ; COFF: andl $-16, %esp ; COFF: pushl %eax ; COFF: calll __alloca -; COFF: movl 8028(%esp), %eax +; COFF: movl 80028(%esp), %eax ; ELF: foo2: ; ELF: andl $-16, %esp ; ELF: pushl %eax ; ELF: calll _alloca -; ELF: movl 8028(%esp), %eax - %A2 = alloca [2000 x i32], align 16 ; <[2000 x i32]*> [#uses=1] - %A2.sub = getelementptr [2000 x i32]* %A2, i32 0, i32 0 ; [#uses=1] +; ELF: movl 80028(%esp), %eax + %A2 = alloca [20000 x i32], align 16 ; <[20000 x i32]*> [#uses=1] + %A2.sub = getelementptr [20000 x i32]* %A2, i32 0, i32 0 ; [#uses=1] call void @bar2( i32* %A2.sub, i32 %N ) ret void } Index: test/CodeGen/X86/pr17631.ll =================================================================== --- test/CodeGen/X86/pr17631.ll +++ test/CodeGen/X86/pr17631.ll @@ -18,7 +18,7 @@ ; CHECK: equal ; CHECK-NOT: vzeroupper -; CHECK: _chkstk +; CHECK: orl $0, 64(%esp) ; CHECK: ret define <8 x float> @foo(<8 x float> %y, i64* %p, double %x) { Index: test/CodeGen/X86/stack-probes.ll =================================================================== --- /dev/null +++ test/CodeGen/X86/stack-probes.ll 
@@ -0,0 +1,37 @@ +; RUN: llc -mtriple=i386-pc-linux-gnu < %s -o - | FileCheck --check-prefix=X86-Linux %s +; RUN: llc -mtriple=x86_64-pc-linux-gnu < %s -o - | FileCheck --check-prefix=X64-Linux %s + +declare void @use([40096 x i8]*) + +define void @test() "probe-stack" { + %array = alloca [40096 x i8], align 16 + call void @use([40096 x i8]* %array) + ret void + +; X86-Linux-LABEL: test: +; X86-Linux: movl $40124, %eax +; X86-Linux-NEXT: calll __probestack +; X86-Linux-NEXT: subl %eax, %esp + +; X64-Linux-LABEL: test: +; X64-Linux: movabsq $40104, %rax +; X64-Linux-NEXT: callq __probestack +; X64-Linux-NEXT: subq %rax, %rsp + +} + +declare void @use_fast([4096 x i8]*) + +define void @test_fast() "probe-stack" { + %array = alloca [4096 x i8], align 16 + call void @use_fast([4096 x i8]* %array) + ret void + +; X86-Linux-LABEL: test_fast: +; X86-Linux: subl $4124, %esp +; X86-Linux-NEXT: orl $0, 28(%esp) + +; X64-Linux-LABEL: test_fast: +; X64-Linux: subq $4104, %rsp +; X64-Linux-NEXT: orq $0, 8(%rsp) +} Index: test/CodeGen/X86/win64_alloca_dynalloca.ll =================================================================== --- test/CodeGen/X86/win64_alloca_dynalloca.ll +++ test/CodeGen/X86/win64_alloca_dynalloca.ll @@ -10,23 +10,23 @@ ; EFI-LABEL: unaligned: entry: - %buf0 = alloca i8, i64 4096, align 1 + %buf0 = alloca i8, i64 40096, align 1 ; ___chkstk_ms does not adjust %rsp. ; M64: movq %rsp, %rbp -; M64: $4096, %rax +; M64: $40096, %rax ; M64: callq ___chkstk_ms ; M64: subq %rax, %rsp ; __chkstk does not adjust %rsp. 
; W64: movq %rsp, %rbp -; W64: $4096, %rax +; W64: $40096, %rax ; W64: callq __chkstk ; W64: subq %rax, %rsp ; Freestanding ; EFI: movq %rsp, %rbp -; EFI: $[[B0OFS:4096|4104]], %rsp +; EFI: $[[B0OFS:40096|40104]], %rsp ; EFI-NOT: call %buf1 = alloca i8, i64 %n, align 1 @@ -53,12 +53,12 @@ ; M64: subq $48, %rsp ; M64: movq %rax, 32(%rsp) -; M64: leaq -4096(%rbp), %r9 +; M64: leaq -40096(%rbp), %r9 ; M64: callq bar ; W64: subq $48, %rsp ; W64: movq %rax, 32(%rsp) -; W64: leaq -4096(%rbp), %r9 +; W64: leaq -40096(%rbp), %r9 ; W64: callq bar ; EFI: subq $48, %rsp Index: test/CodeGen/X86/win64_eh.ll =================================================================== --- test/CodeGen/X86/win64_eh.ll +++ test/CodeGen/X86/win64_eh.ll @@ -30,17 +30,17 @@ ; Checks a stack allocation requiring call to __chkstk/___chkstk_ms define void @foo2() uwtable { entry: - %baz = alloca [4000 x i16], align 2 + %baz = alloca [40000 x i16], align 2 ret void } ; WIN64-LABEL: foo2: ; WIN64: .seh_proc foo2 -; WIN64: movabsq $8000, %rax +; WIN64: movabsq $80000, %rax ; WIN64: callq {{__chkstk|___chkstk_ms}} ; WIN64: subq %rax, %rsp -; WIN64: .seh_stackalloc 8000 +; WIN64: .seh_stackalloc 80000 ; WIN64: .seh_endprologue -; WIN64: addq $8000, %rsp +; WIN64: addq $80000, %rsp ; WIN64: ret ; WIN64: .seh_endproc Index: test/CodeGen/X86/win_chkstk.ll =================================================================== --- test/CodeGen/X86/win_chkstk.ll +++ test/CodeGen/X86/win_chkstk.ll @@ -19,7 +19,7 @@ ; MINGW_X32: calll __alloca ; MINGW_X64: callq ___chkstk_ms ; LINUX-NOT: call __chkstk - %array4096 = alloca [4096 x i8], align 16 ; <[4096 x i8]*> [#uses=0] + %array4096 = alloca [40096 x i8], align 16 ; <[40096 x i8]*> [#uses=0] ret i32 0 } @@ -55,6 +55,6 @@ ; MINGW_X32: calll __alloca ; MINGW_X64: callq ___chkstk_ms ; LINUX-NOT: call __chkstk - %array4096 = alloca [4096 x i8], align 16 ; <[4096 x i8]*> [#uses=0] + %array4096 = alloca [40096 x i8], align 16 ; <[40096 x i8]*> [#uses=0] ret i32 0 }