Index: lib/Target/X86/X86FrameLowering.cpp
===================================================================
--- lib/Target/X86/X86FrameLowering.cpp
+++ lib/Target/X86/X86FrameLowering.cpp
@@ -688,63 +688,88 @@
   // increments is necessary to ensure that the guard pages used by the OS
   // virtual memory manager are allocated in correct sequence.
   if (NumBytes >= PageSize && UseStackProbe) {
-    const char *StackProbeSymbol;
-    unsigned CallOp;
-    getStackProbeFunction(STI, CallOp, StackProbeSymbol);
-
-    // Check whether EAX is livein for this function.
-    bool isEAXAlive = isEAXLiveIn(MF);
-
-    if (isEAXAlive) {
-      // Sanity check that EAX is not livein for this function.
-      // It should not be, so throw an assert.
-      assert(!Is64Bit && "EAX is livein in x64 case!");
-
-      // Save EAX
-      BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH32r))
-        .addReg(X86::EAX, RegState::Kill)
+    // As an optimization, if we only need to probe 5 pages or fewer, we just
+    // emit instructions to do that instead of calling the function; this is
+    // what the loop in the called function would do anyway. The threshold of
+    // 5 probes was picked to match what GCC does. Since we're not calling the
+    // function, we need to adjust the stack pointer ourselves.
+    if (NumBytes <= 5 * PageSize) {
+      BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::SUB64ri32 : X86::SUB32ri),
+              StackPtr)
+        .addReg(StackPtr)
+        .addImm(NumBytes)
         .setMIFlag(MachineInstr::FrameSetup);
-    }
 
-    if (Is64Bit) {
-      // Handle the 64-bit Windows ABI case where we need to call __chkstk.
-      // Function prologue is responsible for adjusting the stack pointer.
-      BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64ri), X86::RAX)
-        .addImm(NumBytes)
-        .setMIFlag(MachineInstr::FrameSetup);
+      for (uint64_t i = 0; i < NumBytes / PageSize; ++i) {
+        BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::OR64mi8 : X86::OR32mi8))
+          .addReg(StackPtr)
+          .addImm(1)
+          .addReg(0)
+          .addImm(NumBytes - (i + 1) * PageSize)
+          .addReg(0)
+          .addImm(0)
+          .setMIFlag(MachineInstr::FrameSetup);
+      }
     } else {
-      // Allocate NumBytes-4 bytes on stack in case of isEAXAlive.
-      // We'll also use 4 already allocated bytes for EAX.
-      BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX)
-        .addImm(isEAXAlive ? NumBytes - 4 : NumBytes)
-        .setMIFlag(MachineInstr::FrameSetup);
-    }
+      const char *StackProbeSymbol;
+      unsigned CallOp;
+
+      getStackProbeFunction(STI, CallOp, StackProbeSymbol);
+
+      // Check whether EAX is livein for this function.
+      bool isEAXAlive = isEAXLiveIn(MF);
+
+      if (isEAXAlive) {
+        // Sanity check that EAX is not livein for this function.
+        // It should not be, so throw an assert.
+        assert(!Is64Bit && "EAX is livein in x64 case!");
+
+        // Save EAX
+        BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH32r))
+          .addReg(X86::EAX, RegState::Kill)
+          .setMIFlag(MachineInstr::FrameSetup);
+      }
 
-    BuildMI(MBB, MBBI, DL,
-            TII.get(CallOp))
-      .addExternalSymbol(StackProbeSymbol)
-      .addReg(StackPtr, RegState::Define | RegState::Implicit)
-      .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit)
-      .setMIFlag(MachineInstr::FrameSetup);
+      if (Is64Bit) {
+        // Handle the 64-bit Windows ABI case where we need to call __chkstk.
+        // Function prologue is responsible for adjusting the stack pointer.
+        BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64ri), X86::RAX)
+          .addImm(NumBytes)
+          .setMIFlag(MachineInstr::FrameSetup);
+      } else {
+        // Allocate NumBytes-4 bytes on stack in case of isEAXAlive.
+        // We'll also use 4 already allocated bytes for EAX.
+        BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX)
+          .addImm(isEAXAlive ? NumBytes - 4 : NumBytes)
+          .setMIFlag(MachineInstr::FrameSetup);
+      }
 
-    if (Is64Bit) {
-      // MSVC x64's __chkstk and cygwin/mingw's ___chkstk_ms do not adjust %rsp
-      // themself. It also does not clobber %rax so we can reuse it when
-      // adjusting %rsp.
-      BuildMI(MBB, MBBI, DL, TII.get(X86::SUB64rr), StackPtr)
-        .addReg(StackPtr)
-        .addReg(X86::RAX)
+      BuildMI(MBB, MBBI, DL,
+              TII.get(CallOp))
+        .addExternalSymbol(StackProbeSymbol)
+        .addReg(StackPtr, RegState::Define | RegState::Implicit)
+        .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit)
         .setMIFlag(MachineInstr::FrameSetup);
+
+      if (Is64Bit) {
+        // MSVC x64's __chkstk and cygwin/mingw's ___chkstk_ms do not adjust
+        // %rsp themselves. They also do not clobber %rax, so we can reuse it
+        // when adjusting %rsp.
+        BuildMI(MBB, MBBI, DL, TII.get(X86::SUB64rr), StackPtr)
+          .addReg(StackPtr)
+          .addReg(X86::RAX)
+          .setMIFlag(MachineInstr::FrameSetup);
+      }
+      if (isEAXAlive) {
+        // Restore EAX
+        MachineInstr *MI = addRegOffset(BuildMI(MF, DL, TII.get(X86::MOV32rm),
+                                                X86::EAX),
+                                        StackPtr, false, NumBytes - 4);
+        MI->setFlag(MachineInstr::FrameSetup);
+        MBB.insert(MBBI, MI);
+      }
     }
-    if (isEAXAlive) {
-      // Restore EAX
-      MachineInstr *MI = addRegOffset(BuildMI(MF, DL, TII.get(X86::MOV32rm),
-                                              X86::EAX),
-                                      StackPtr, false, NumBytes - 4);
-      MI->setFlag(MachineInstr::FrameSetup);
-      MBB.insert(MBBI, MI);
-    }
   } else if (NumBytes) {
     emitSPUpdate(MBB, MBBI, StackPtr, -(int64_t)NumBytes, Is64Bit,
                  Uses64BitFramePtr, UseLEA, TII, *RegInfo);
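As a sketch of what the inline path above should emit, assuming the usual
4096-byte page size: for a three-page (12288-byte) frame on x86-64, the
displacements fall out of NumBytes - (i + 1) * PageSize, so the pages are
touched starting from the one nearest the old stack pointer, in the order
the guard-page comment at the top of the hunk requires:

    subq $12288, %rsp        # adjust the stack pointer once, up front
    orq  $0, 8192(%rsp)      # i = 0: touch the page nearest the old %rsp
    orq  $0, 4096(%rsp)      # i = 1: touch the middle page
    orq  $0, (%rsp)          # i = 2: touch the page at the new %rsp

The or of zero is a read-modify-write no-op; its only effect is the memory
access that faults in each guard page in sequence. Frames larger than
5 * PageSize still take the __chkstk/___chkstk_ms path in the else branch.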
Index: test/CodeGen/X86/mingw-alloca.ll
===================================================================
--- test/CodeGen/X86/mingw-alloca.ll
+++ test/CodeGen/X86/mingw-alloca.ll
@@ -22,14 +22,14 @@
 ; COFF: andl $-16, %esp
 ; COFF: pushl %eax
 ; COFF: calll __alloca
-; COFF: movl 8028(%esp), %eax
+; COFF: movl 80028(%esp), %eax
 ; ELF: foo2:
 ; ELF: andl $-16, %esp
 ; ELF: pushl %eax
 ; ELF: calll _alloca
-; ELF: movl 8028(%esp), %eax
-  %A2 = alloca [2000 x i32], align 16              ; <[2000 x i32]*> [#uses=1]
-  %A2.sub = getelementptr [2000 x i32]* %A2, i32 0, i32 0    ; <i32*> [#uses=1]
+; ELF: movl 80028(%esp), %eax
+  %A2 = alloca [20000 x i32], align 16             ; <[20000 x i32]*> [#uses=1]
+  %A2.sub = getelementptr [20000 x i32]* %A2, i32 0, i32 0   ; <i32*> [#uses=1]
   call void @bar2( i32* %A2.sub, i32 %N )
   ret void
 }
Index: test/CodeGen/X86/pr17631.ll
===================================================================
--- test/CodeGen/X86/pr17631.ll
+++ test/CodeGen/X86/pr17631.ll
@@ -18,7 +18,7 @@
 
 ; CHECK: equal
 ; CHECK-NOT: vzeroupper
-; CHECK: _chkstk
+; CHECK: orl $0, 64(%esp)
 ; CHECK: ret
 
 define <8 x float> @foo(<8 x float> %y, i64* %p, double %x) {
Index: test/CodeGen/X86/win64_alloca_dynalloca.ll
===================================================================
--- test/CodeGen/X86/win64_alloca_dynalloca.ll
+++ test/CodeGen/X86/win64_alloca_dynalloca.ll
@@ -10,23 +10,23 @@
 ; EFI-LABEL: unaligned:
 entry:
-  %buf0 = alloca i8, i64 4096, align 1
+  %buf0 = alloca i8, i64 40096, align 1
 
 ; ___chkstk_ms does not adjust %rsp.
 ; M64: movq %rsp, %rbp
-; M64: $4096, %rax
+; M64: $40096, %rax
 ; M64: callq ___chkstk_ms
 ; M64: subq %rax, %rsp
 
 ; __chkstk does not adjust %rsp.
 ; W64: movq %rsp, %rbp
-; W64: $4096, %rax
+; W64: $40096, %rax
 ; W64: callq __chkstk
 ; W64: subq %rax, %rsp
 
 ; Freestanding
 ; EFI: movq %rsp, %rbp
-; EFI: $[[B0OFS:4096|4104]], %rsp
+; EFI: $[[B0OFS:40096|40104]], %rsp
 ; EFI-NOT: call
 
   %buf1 = alloca i8, i64 %n, align 1
@@ -53,12 +53,12 @@
 
 ; M64: subq $48, %rsp
 ; M64: movq %rax, 32(%rsp)
-; M64: leaq -4096(%rbp), %r9
+; M64: leaq -40096(%rbp), %r9
 ; M64: callq bar
 
 ; W64: subq $48, %rsp
 ; W64: movq %rax, 32(%rsp)
-; W64: leaq -4096(%rbp), %r9
+; W64: leaq -40096(%rbp), %r9
 ; W64: callq bar
 
 ; EFI: subq $48, %rsp
Index: test/CodeGen/X86/win64_eh.ll
===================================================================
--- test/CodeGen/X86/win64_eh.ll
+++ test/CodeGen/X86/win64_eh.ll
@@ -30,21 +30,39 @@
 ; Checks a stack allocation requiring call to __chkstk/___chkstk_ms
 define void @foo2() uwtable {
 entry:
-  %baz = alloca [4000 x i16], align 2
+  %baz = alloca [40000 x i16], align 2
   ret void
 }
 ; WIN64-LABEL: foo2:
 ; WIN64: .seh_proc foo2
-; WIN64: movabsq $8000, %rax
+; WIN64: movabsq $80000, %rax
 ; WIN64: callq {{__chkstk|___chkstk_ms}}
 ; WIN64: subq %rax, %rsp
-; WIN64: .seh_stackalloc 8000
+; WIN64: .seh_stackalloc 80000
 ; WIN64: .seh_endprologue
-; WIN64: addq $8000, %rsp
+; WIN64: addq $80000, %rsp
 ; WIN64: ret
 ; WIN64: .seh_endproc
 
+; Checks an optimization that avoids a call to __chkstk by directly
+; emitting the stack probes for mid-sized frames
+define void @foo2_touch() uwtable {
+entry:
+  %baz = alloca [4096 x i8], align 2
+  ret void
+}
+; WIN64-LABEL: foo2_touch:
+; WIN64: .seh_proc foo2_touch
+; WIN64: subq $4096, %rsp
+; WIN64: orq $0, (%rsp)
+; WIN64: .seh_stackalloc 4096
+; WIN64: .seh_endprologue
+; WIN64: addq $4096, %rsp
+; WIN64: ret
+; WIN64: .seh_endproc
+
+
 ; Checks stack push
 define i32 @foo3(i32 %f_arg, i32 %e_arg, i32 %d_arg, i32 %c_arg, i32 %b_arg, i32 %a_arg) uwtable {
 entry:
Index: test/CodeGen/X86/win_chkstk.ll
===================================================================
--- test/CodeGen/X86/win_chkstk.ll
+++ test/CodeGen/X86/win_chkstk.ll
@@ -19,7 +19,7 @@
 ; MINGW_X32: calll __alloca
 ; MINGW_X64: callq ___chkstk_ms
 ; LINUX-NOT: call __chkstk
-  %array4096 = alloca [4096 x i8], align 16        ; <[4096 x i8]*> [#uses=0]
+  %array4096 = alloca [40096 x i8], align 16       ; <[40096 x i8]*> [#uses=0]
   ret i32 0
 }
 
@@ -55,6 +55,6 @@
 ; MINGW_X32: calll __alloca
 ; MINGW_X64: callq ___chkstk_ms
 ; LINUX-NOT: call __chkstk
-  %array4096 = alloca [4096 x i8], align 16        ; <[4096 x i8]*> [#uses=0]
+  %array4096 = alloca [40096 x i8], align 16       ; <[40096 x i8]*> [#uses=0]
   ret i32 0
 }
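For completeness, a minimal IR example (hypothetical; not one of the tests
changed above) that should sit exactly at the threshold and still take the
inline-probe path, assuming a 4096-byte page size and no other frame objects:

define void @five_page_frame() {
entry:
  %buf = alloca [20480 x i8], align 16  ; 20480 = 5 * 4096, so NumBytes <= 5 * PageSize
  ret void
}

; Expected win64 prologue: subq $20480, %rsp followed by five probes at page
; strides, orq $0, 16384(%rsp) down through orq $0, (%rsp); a larger frame
; should instead produce the movabsq/callq __chkstk sequence checked in
; win64_eh.ll above.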