Index: llvm/lib/Target/X86/X86FrameLowering.h
===================================================================
--- llvm/lib/Target/X86/X86FrameLowering.h
+++ llvm/lib/Target/X86/X86FrameLowering.h
@@ -213,14 +213,14 @@
   void emitStackProbeInlineGenericBlock(MachineFunction &MF,
                                         MachineBasicBlock &MBB,
                                         MachineBasicBlock::iterator MBBI,
-                                        const DebugLoc &DL,
-                                        uint64_t Offset) const;
+                                        const DebugLoc &DL, uint64_t Offset,
+                                        uint64_t Align) const;
 
   void emitStackProbeInlineGenericLoop(MachineFunction &MF,
                                        MachineBasicBlock &MBB,
                                        MachineBasicBlock::iterator MBBI,
-                                       const DebugLoc &DL,
-                                       uint64_t Offset) const;
+                                       const DebugLoc &DL, uint64_t Offset,
+                                       uint64_t Align) const;
 
   /// Emit a stub to later inline the target stack probe.
   MachineInstr *emitStackProbeInlineStub(MachineFunction &MF,
Index: llvm/lib/Target/X86/X86FrameLowering.cpp
===================================================================
--- llvm/lib/Target/X86/X86FrameLowering.cpp
+++ llvm/lib/Target/X86/X86FrameLowering.cpp
@@ -586,28 +586,49 @@
   const uint64_t StackProbeSize = TLI.getStackProbeSize(MF);
   uint64_t ProbeChunk = StackProbeSize * 8;
 
+  uint64_t MaxAlign = calculateMaxStackAlign(MF);
+
   // Synthesize a loop or unroll it, depending on the number of iterations.
-  if (Offset > ProbeChunk) {
-    emitStackProbeInlineGenericLoop(MF, MBB, MBBI, DL, Offset);
+  if (Offset > ProbeChunk || MaxAlign > ProbeChunk) {
+    emitStackProbeInlineGenericLoop(MF, MBB, MBBI, DL, Offset, MaxAlign);
   } else {
-    emitStackProbeInlineGenericBlock(MF, MBB, MBBI, DL, Offset);
+    emitStackProbeInlineGenericBlock(MF, MBB, MBBI, DL, Offset, MaxAlign);
   }
 }
 
 void X86FrameLowering::emitStackProbeInlineGenericBlock(
     MachineFunction &MF, MachineBasicBlock &MBB,
-    MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
-    uint64_t Offset) const {
+    MachineBasicBlock::iterator MBBI, const DebugLoc &DL, uint64_t Offset,
+    uint64_t Align) const {
 
   const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
   const X86TargetLowering &TLI = *STI.getTargetLowering();
   const unsigned Opc = getSUBriOpcode(Uses64BitFramePtr, Offset);
   const unsigned MovMIOpc = Is64Bit ? X86::MOV64mi32 : X86::MOV32mi;
   const uint64_t StackProbeSize = TLI.getStackProbeSize(MF);
+
   uint64_t CurrentOffset = 0;
-  // 0 Thanks to return address being saved on the stack
   uint64_t CurrentProbeOffset = 0;
 
+  if (CurrentOffset + StackProbeSize < Offset) {
+    assert(Align < StackProbeSize &&
+           "Should be an emitStackProbeInlineGenericLoop");
+
+    MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr)
+                           .addReg(StackPtr)
+                           .addImm(StackProbeSize - Align)
+                           .setMIFlag(MachineInstr::FrameSetup);
+    MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead.
+
+    addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(MovMIOpc))
+                     .setMIFlag(MachineInstr::FrameSetup),
+                 StackPtr, false, 0)
+        .addImm(0)
+        .setMIFlag(MachineInstr::FrameSetup);
+    NumFrameExtraProbe++;
+    CurrentOffset = StackProbeSize - Align;
+  }
+
   // For the first N - 1 pages, just probe. I tried to take advantage of
   // natural probes but it implies much more logic and there was very few
   // interesting natural probes to interleave.
@@ -639,8 +660,8 @@
 
 void X86FrameLowering::emitStackProbeInlineGenericLoop(
     MachineFunction &MF, MachineBasicBlock &MBB,
-    MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
-    uint64_t Offset) const {
+    MachineBasicBlock::iterator MBBI, const DebugLoc &DL, uint64_t Offset,
+    uint64_t Align) const {
   assert(Offset && "null offset");
 
   const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
@@ -648,6 +669,15 @@
   const unsigned MovMIOpc = Is64Bit ? X86::MOV64mi32 : X86::MOV32mi;
   const uint64_t StackProbeSize = TLI.getStackProbeSize(MF);
 
+  if (Align) {
+    const unsigned ADDOpc = getADDriOpcode(Uses64BitFramePtr, Align);
+    BuildMI(MBB, MBBI, DL, TII.get(ADDOpc), StackPtr)
+        .addReg(StackPtr)
+        .addImm(Align)
+        .setMIFlag(MachineInstr::FrameSetup);
+    Offset += Align;
+  }
+
   // Synthesize a loop
   NumFrameLoopProbe++;
   const BasicBlock *LLVM_BB = MBB.getBasicBlock();
@@ -666,8 +696,8 @@
 
   // save loop bound
   {
-    const unsigned Opc = getSUBriOpcode(Uses64BitFramePtr, Offset);
-    BuildMI(MBB, MBBI, DL, TII.get(Opc), FinalStackProbed)
+    const unsigned SUBOpc = getSUBriOpcode(Uses64BitFramePtr, Offset);
+    BuildMI(MBB, MBBI, DL, TII.get(SUBOpc), FinalStackProbed)
         .addReg(FinalStackProbed)
         .addImm(Offset / StackProbeSize * StackProbeSize)
         .setMIFlag(MachineInstr::FrameSetup);
@@ -675,8 +705,8 @@
 
   // allocate a page
   {
-    const unsigned Opc = getSUBriOpcode(Uses64BitFramePtr, StackProbeSize);
-    BuildMI(testMBB, DL, TII.get(Opc), StackPtr)
+    const unsigned SUBOpc = getSUBriOpcode(Uses64BitFramePtr, StackProbeSize);
+    BuildMI(testMBB, DL, TII.get(SUBOpc), StackPtr)
         .addReg(StackPtr)
         .addImm(StackProbeSize)
         .setMIFlag(MachineInstr::FrameSetup);
Index: llvm/test/CodeGen/X86/stack-clash-large.ll
===================================================================
--- llvm/test/CodeGen/X86/stack-clash-large.ll
+++ llvm/test/CodeGen/X86/stack-clash-large.ll
@@ -16,6 +16,7 @@
 
 ; CHECK-X86-64-LABEL: foo:
 ; CHECK-X86-64: # %bb.0:
+; CHECK-X86-64-NEXT: addq $16, %rsp
 ; CHECK-X86-64-NEXT: movq %rsp, %r11
 ; CHECK-X86-64-NEXT: subq $69632, %r11 # imm = 0x11000
 ; CHECK-X86-64-NEXT: .LBB0_1:
@@ -24,7 +25,7 @@
 ; CHECK-X86-64-NEXT: cmpq %r11, %rsp
 ; CHECK-X86-64-NEXT: jne .LBB0_1
 ; CHECK-X86-64-NEXT:# %bb.2:
-; CHECK-X86-64-NEXT: subq $2248, %rsp
+; CHECK-X86-64-NEXT: subq $2264, %rsp
 ; CHECK-X86-64-NEXT: .cfi_def_cfa_offset 71888
 ; CHECK-X86-64-NEXT: movl $1, 264(%rsp)
 ; CHECK-X86-64-NEXT: movl $1, 28664(%rsp)
@@ -35,6 +36,7 @@
 
 ; CHECK-X86-32-LABEL: foo:
 ; CHECK-X86-32: # %bb.0:
+; CHECK-X86-32-NEXT: addl $16, %esp
 ; CHECK-X86-32-NEXT: movl %esp, %r11d
 ; CHECK-X86-32-NEXT: subl $69632, %r11d # imm = 0x11000
 ; CHECK-X86-32-NEXT: .LBB0_1: # =>This Inner Loop Header: Depth=1
@@ -43,7 +45,7 @@
 ; CHECK-X86-32-NEXT: cmpl %r11d, %esp
 ; CHECK-X86-32-NEXT: jne .LBB0_1
 ; CHECK-X86-32-NEXT:# %bb.2:
-; CHECK-X86-32-NEXT: subl $2380, %esp
+; CHECK-X86-32-NEXT: subl $2396, %esp
 ; CHECK-X86-32-NEXT: .cfi_def_cfa_offset 72016
 ; CHECK-X86-32-NEXT: movl $1, 392(%esp)
 ; CHECK-X86-32-NEXT: movl $1, 28792(%esp)
Index: llvm/test/CodeGen/X86/stack-clash-medium-natural-probes-mutliple-objects.ll
===================================================================
--- llvm/test/CodeGen/X86/stack-clash-medium-natural-probes-mutliple-objects.ll
+++ llvm/test/CodeGen/X86/stack-clash-medium-natural-probes-mutliple-objects.ll
@@ -7,9 +7,9 @@
 define i32 @foo() local_unnamed_addr #0 {
 ; CHECK-LABEL: foo:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: subq $4096, %rsp # imm = 0x1000
+; CHECK-NEXT: subq $4080, %rsp # imm = 0xFF0
 ; CHECK-NEXT: movq $0, (%rsp)
-; CHECK-NEXT: subq $1784, %rsp # imm = 0x6F8
+; CHECK-NEXT: subq $1800, %rsp # imm = 0x708
 ; CHECK-NEXT: .cfi_def_cfa_offset 5888
 ; CHECK-NEXT: movl $1, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT: movl $2, {{[0-9]+}}(%rsp)
Index: llvm/test/CodeGen/X86/stack-clash-medium-natural-probes.ll
===================================================================
--- llvm/test/CodeGen/X86/stack-clash-medium-natural-probes.ll
+++ llvm/test/CodeGen/X86/stack-clash-medium-natural-probes.ll
@@ -8,9 +8,9 @@
 
 ; CHECK-LABEL: foo:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: subq $4096, %rsp # imm = 0x1000
+; CHECK-NEXT: subq $4080, %rsp # imm = 0xFF0
 ; CHECK-NEXT: movq $0, (%rsp)
-; CHECK-NEXT: subq $3784, %rsp # imm = 0xEC8
+; CHECK-NEXT: subq $3800, %rsp # imm = 0xED8
 ; CHECK-NEXT: .cfi_def_cfa_offset 7888
 ; CHECK-NEXT: movl $1, 264(%rsp)
 ; CHECK-NEXT: movl $1, 4664(%rsp)
Index: llvm/test/CodeGen/X86/stack-clash-medium.ll
===================================================================
--- llvm/test/CodeGen/X86/stack-clash-medium.ll
+++ llvm/test/CodeGen/X86/stack-clash-medium.ll
@@ -13,9 +13,9 @@
 
 ; CHECK-X86-64-LABEL: foo:
 ; CHECK-X86-64: # %bb.0:
-; CHECK-X86-64-NEXT: subq $4096, %rsp # imm = 0x1000
+; CHECK-X86-64-NEXT: subq $4080, %rsp # imm = 0xFF0
 ; CHECK-X86-64-NEXT: movq $0, (%rsp)
-; CHECK-X86-64-NEXT: subq $3784, %rsp # imm = 0xEC8
+; CHECK-X86-64-NEXT: subq $3800, %rsp # imm = 0xED8
 ; CHECK-X86-64-NEXT: .cfi_def_cfa_offset 7888
 ; CHECK-X86-64-NEXT: movl $1, 672(%rsp)
 ; CHECK-X86-64-NEXT: movl -128(%rsp), %eax
@@ -26,9 +26,9 @@
 
 ; CHECK-X86-32-LABEL: foo:
 ; CHECK-X86-32: # %bb.0:
-; CHECK-X86-32-NEXT: subl $4096, %esp # imm = 0x1000
+; CHECK-X86-32-NEXT: subl $4080, %esp # imm = 0xFF0
 ; CHECK-X86-32-NEXT: movl $0, (%esp)
-; CHECK-X86-32-NEXT: subl $3916, %esp # imm = 0xF4C
+; CHECK-X86-32-NEXT: subl $3932, %esp # imm = 0xF5C
 ; CHECK-X86-32-NEXT: .cfi_def_cfa_offset 8016
 ; CHECK-X86-32-NEXT: movl $1, 800(%esp)
 ; CHECK-X86-32-NEXT: movl (%esp), %eax
Index: llvm/test/CodeGen/X86/stack-clash-no-free-probe.ll
===================================================================
--- llvm/test/CodeGen/X86/stack-clash-no-free-probe.ll
+++ llvm/test/CodeGen/X86/stack-clash-no-free-probe.ll
@@ -6,9 +6,9 @@
 define i32 @foo(i64 %i) local_unnamed_addr #0 {
 ; CHECK-LABEL: foo:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: subq $4096, %rsp # imm = 0x1000
+; CHECK-NEXT: subq $4080, %rsp # imm = 0xFF0
 ; CHECK-NEXT: movq $0, (%rsp)
-; CHECK-NEXT: subq $3784, %rsp # imm = 0xEC8
+; CHECK-NEXT: subq $3800, %rsp # imm = 0xED8
 ; CHECK-NEXT: .cfi_def_cfa_offset 7888
 ; CHECK-NEXT: movl $1, -128(%rsp,%rdi,4)
 ; CHECK-NEXT: movl -128(%rsp), %eax
Index: llvm/test/CodeGen/X86/stack-clash-small-large-align.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/X86/stack-clash-small-large-align.ll
@@ -0,0 +1,66 @@
+; RUN: llc < %s | FileCheck %s
+
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define i32 @foo_noprotect() local_unnamed_addr {
+; CHECK-LABEL: foo_noprotect:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset %rbp, -16
+; CHECK-NEXT: movq %rsp, %rbp
+; CHECK-NEXT: .cfi_def_cfa_register %rbp
+; CHECK-NEXT: andq $-65536, %rsp
+; CHECK-NEXT: subq $65536, %rsp
+; CHECK-NEXT: movl $1, 392(%rsp)
+; CHECK-NEXT: movl (%rsp), %eax
+; CHECK-NEXT: movq %rbp, %rsp
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: .cfi_def_cfa %rsp, 8
+; CHECK-NEXT: retq
+
+
+
+  %a = alloca i32, i64 100, align 65536
+  %b = getelementptr inbounds i32, i32* %a, i64 98
+  store volatile i32 1, i32* %b
+  %c = load volatile i32, i32* %a
+  ret i32 %c
+}
+
+define i32 @foo_protect() local_unnamed_addr #0 {
+; CHECK-LABEL: foo_protect:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset %rbp, -16
+; CHECK-NEXT: movq %rsp, %rbp
+; CHECK-NEXT: .cfi_def_cfa_register %rbp
+; CHECK-NEXT: andq $-65536, %rsp
+; CHECK-NEXT: addq $65536, %rsp
+; CHECK-NEXT: movq %rsp, %r11
+; CHECK-NEXT: subq $131072, %r11
+; CHECK-NEXT:.LBB0_1:
+; CHECK-NEXT: subq $4096, %rsp
+; CHECK-NEXT: movq $0, (%rsp)
+; CHECK-NEXT: cmpq %r11, %rsp
+; CHECK-NEXT: jne .LBB0_1
+; CHECK-NEXT:# %bb.2:
+; CHECK-NEXT: movl $1, 392(%rsp)
+; CHECK-NEXT: movl (%rsp), %eax
+; CHECK-NEXT: movq %rbp, %rsp
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: .cfi_def_cfa %rsp, 8
+; CHECK-NEXT: retq
+
+
+  %a = alloca i32, i64 100, align 65536
+  %b = getelementptr inbounds i32, i32* %a, i64 98
+  store volatile i32 1, i32* %b
+  %c = load volatile i32, i32* %a
+  ret i32 %c
+}
+
+attributes #0 = {"probe-stack"="inline-asm"}
Index: llvm/test/CodeGen/X86/stack-clash-unknown-call.ll
===================================================================
--- llvm/test/CodeGen/X86/stack-clash-unknown-call.ll
+++ llvm/test/CodeGen/X86/stack-clash-unknown-call.ll
@@ -10,10 +10,10 @@
 
 ;CHECK-LABEL: foo:
 ;CHECK: # %bb.0:
-;CHECK-NEXT: subq $4096, %rsp # imm = 0x1000
+;CHECK-NEXT: subq $4080, %rsp # imm = 0xFF0
 ; it's important that we don't use the call as a probe here
 ;CHECK-NEXT: movq $0, (%rsp)
-;CHECK-NEXT: subq $3912, %rsp # imm = 0xF48
+;CHECK-NEXT: subq $3928, %rsp # imm = 0xF58
 ;CHECK-NEXT: .cfi_def_cfa_offset 8016
 ;CHECK-NEXT: movq %rsp, %rdi
 ;CHECK-NEXT: movl $8000, %edx # imm = 0x1F40