diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp --- a/llvm/lib/Target/X86/X86FrameLowering.cpp +++ b/llvm/lib/Target/X86/X86FrameLowering.cpp @@ -529,6 +529,7 @@ const X86TargetLowering &TLI = *STI.getTargetLowering(); assert(!(STI.is64Bit() && STI.isTargetWindowsCoreCLR()) && "different expansion expected for CoreCLR 64 bit"); + assert(InProlog && "different expansion expected outside prolog"); const uint64_t StackProbeSize = TLI.getStackProbeSize(MF); uint64_t ProbeChunk = StackProbeSize * 8; @@ -618,80 +619,91 @@ const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>(); const X86TargetLowering &TLI = *STI.getTargetLowering(); - const unsigned MovMIOpc = Is64Bit ? X86::MOV64mi32 : X86::MOV32mi; + const unsigned MovMROpc = Is64Bit ? X86::MOV64mr : X86::MOV32mr, + MovRIOpc = Is64Bit ? X86::MOV64ri : X86::MOV32ri; const uint64_t StackProbeSize = TLI.getStackProbeSize(MF); - if (AlignOffset) { - if (AlignOffset < StackProbeSize) { - // Perform a first smaller allocation followed by a probe. - const unsigned SUBOpc = getSUBriOpcode(Uses64BitFramePtr, AlignOffset); - MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(SUBOpc), StackPtr) - .addReg(StackPtr) - .addImm(AlignOffset) - .setMIFlag(MachineInstr::FrameSetup); - MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead. + // TODO: we can avoid use of one of the registers if the `Offset` fits into a + // `cmp` instruction operand. - addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(MovMIOpc)) - .setMIFlag(MachineInstr::FrameSetup), - StackPtr, false, 0) - .addImm(0) - .setMIFlag(MachineInstr::FrameSetup); - NumFrameExtraProbe++; - Offset -= AlignOffset; - } + // Set up some registers for loop use. + // We'll be using R/EAX and R11/EBX registers to minimize the chances they're + // live-in here and we need to spill. + X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); + const Register SizeReg = Uses64BitFramePtr ? 
X86::RAX : X86::EAX, + // FIXME: Should still use R11D on x86_64-gnux32? + PositionReg = Uses64BitFramePtr ? X86::R11 : X86::EBX; + const bool IsSizeRegLiveIn = MBB.isLiveIn(SizeReg); + const bool IsPositionRegLiveIn = MBB.isLiveIn(PositionReg); + + // TODO: this is probably wrong (and also hard to test, needs non-traditional + // ABIs) + // TODO: how to correctly spill the registers if necessary? + + // Pick some save slots that are definitely not going to conflict (we're + // probing so we have a full page worth of offsets under `rsp` to use for our + // purposes). + int64_t SizeRegSaveSlot = + X86FI->getCalleeSavedFrameSize() + 16 * IsSizeRegLiveIn, + PositionRegSaveSlot = SizeRegSaveSlot + 8; + if (IsSizeRegLiveIn) + addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(MovMROpc)), X86::RSP, false, + SizeRegSaveSlot) + .addReg(SizeReg) + .setMIFlag(MachineInstr::FrameSetup); + if (IsPositionRegLiveIn) + addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(MovMROpc)), X86::RSP, false, + PositionRegSaveSlot) + .addReg(PositionReg) + .setMIFlag(MachineInstr::FrameSetup); + + // Prepare our loop registers + BuildMI(MBB, MBBI, DL, TII.get(MovRIOpc), SizeReg) + .addImm(-Offset) + .setMIFlag(MachineInstr::FrameSetup); + if (AlignOffset != 0 && AlignOffset < StackProbeSize) { + // Perform a first smaller allocation followed by regular sized probes. + BuildMI(MBB, MBBI, DL, TII.get(MovRIOpc), PositionReg) + .addImm(-AlignOffset) + .setMIFlag(MachineInstr::FrameSetup); + } else { + // Otherwise allocate in pages. 
+ BuildMI(MBB, MBBI, DL, TII.get(MovRIOpc), PositionReg) + .addImm(-StackProbeSize) + .setMIFlag(MachineInstr::FrameSetup); } // Synthesize a loop NumFrameLoopProbe++; const BasicBlock *LLVM_BB = MBB.getBasicBlock(); - MachineBasicBlock *testMBB = MF.CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *tailMBB = MF.CreateMachineBasicBlock(LLVM_BB); - MachineFunction::iterator MBBIter = ++MBB.getIterator(); MF.insert(MBBIter, testMBB); MF.insert(MBBIter, tailMBB); - Register FinalStackProbed = Uses64BitFramePtr ? X86::R11 : X86::R11D; - BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::COPY), FinalStackProbed) - .addReg(StackPtr) - .setMIFlag(MachineInstr::FrameSetup); - - // save loop bound - { - const unsigned SUBOpc = getSUBriOpcode(Uses64BitFramePtr, Offset); - BuildMI(MBB, MBBI, DL, TII.get(SUBOpc), FinalStackProbed) - .addReg(FinalStackProbed) - .addImm(Offset / StackProbeSize * StackProbeSize) - .setMIFlag(MachineInstr::FrameSetup); - } - - // allocate a page - { - const unsigned SUBOpc = getSUBriOpcode(Uses64BitFramePtr, StackProbeSize); - BuildMI(testMBB, DL, TII.get(SUBOpc), StackPtr) - .addReg(StackPtr) - .addImm(StackProbeSize) - .setMIFlag(MachineInstr::FrameSetup); - } - // touch the page - addRegOffset(BuildMI(testMBB, DL, TII.get(MovMIOpc)) - .setMIFlag(MachineInstr::FrameSetup), - StackPtr, false, 0) + addRegReg(BuildMI(testMBB, DL, TII.get(X86::MOV8mi)) + .setMIFlag(MachineInstr::FrameSetup), + StackPtr, false, PositionReg, false) .addImm(0) .setMIFlag(MachineInstr::FrameSetup); - - // cmp with stack pointer bound + // "allocate" a next page + BuildMI(testMBB, DL, + TII.get(getSUBriOpcode(Uses64BitFramePtr, StackProbeSize)), + PositionReg) + .addReg(PositionReg) + .addImm(StackProbeSize) + .setMIFlag(MachineInstr::FrameSetup); + // cmp with the number of bytes we must allocate BuildMI(testMBB, DL, TII.get(Uses64BitFramePtr ? 
X86::CMP64rr : X86::CMP32rr)) - .addReg(StackPtr) - .addReg(FinalStackProbed) + .addReg(PositionReg) + .addReg(SizeReg) .setMIFlag(MachineInstr::FrameSetup); - // jump BuildMI(testMBB, DL, TII.get(X86::JCC_1)) .addMBB(testMBB) - .addImm(X86::COND_NE) + .addImm(X86::COND_G) .setMIFlag(MachineInstr::FrameSetup); testMBB->addSuccessor(testMBB); testMBB->addSuccessor(tailMBB); @@ -702,14 +714,12 @@ MBB.addSuccessor(testMBB); // handle tail - unsigned TailOffset = Offset % StackProbeSize; - if (TailOffset) { - const unsigned Opc = getSUBriOpcode(Uses64BitFramePtr, TailOffset); - BuildMI(*tailMBB, tailMBB->begin(), DL, TII.get(Opc), StackPtr) - .addReg(StackPtr) - .addImm(TailOffset) - .setMIFlag(MachineInstr::FrameSetup); - } + const unsigned Opc = getSUBriOpcode(Uses64BitFramePtr, Offset); + BuildMI(*tailMBB, tailMBB->begin(), DL, TII.get(Opc), StackPtr) + .addReg(StackPtr) + .addImm(Offset) + .setMIFlag(MachineInstr::FrameSetup); + // TODO: pop old values for the registers we used. // Update Live In information recomputeLiveIns(*testMBB); diff --git a/llvm/test/CodeGen/X86/stack-clash-large-large-align.ll b/llvm/test/CodeGen/X86/stack-clash-large-large-align.ll --- a/llvm/test/CodeGen/X86/stack-clash-large-large-align.ll +++ b/llvm/test/CodeGen/X86/stack-clash-large-large-align.ll @@ -55,14 +55,15 @@ ; CHECK-NEXT: movq %r11, %rsp ; CHECK-NEXT: movq $0, (%rsp) ; CHECK-NEXT: .LBB1_4: -; CHECK-NEXT: movq %rsp, %r11 -; CHECK-NEXT: subq $73728, %r11 # imm = 0x12000 +; CHECK-NEXT: movabsq $-73728, %rax # imm = 0xFFFEE000 +; CHECK-NEXT: movabsq $-4096, %r11 # imm = 0xF000 ; CHECK-NEXT: .LBB1_5: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: subq $4096, %rsp # imm = 0x1000 -; CHECK-NEXT: movq $0, (%rsp) -; CHECK-NEXT: cmpq %r11, %rsp -; CHECK-NEXT: jne .LBB1_5 +; CHECK-NEXT: movb $0, (%rsp,%r11) +; CHECK-NEXT: subq $4096, %r11 # imm = 0x1000 +; CHECK-NEXT: cmpq %rax, %r11 +; CHECK-NEXT: jg .LBB1_5 ; CHECK-NEXT: # %bb.6: +; CHECK-NEXT: subq $73728, %rsp # imm = 0x12000 ; 
CHECK-NEXT: movl $1, 392(%rsp) ; CHECK-NEXT: movl $1, 28792(%rsp) ; CHECK-NEXT: movl (%rsp), %eax diff --git a/llvm/test/CodeGen/X86/stack-clash-large.ll b/llvm/test/CodeGen/X86/stack-clash-large.ll --- a/llvm/test/CodeGen/X86/stack-clash-large.ll +++ b/llvm/test/CodeGen/X86/stack-clash-large.ll @@ -5,15 +5,15 @@ define i32 @foo() local_unnamed_addr #0 { ; CHECK-X86-64-LABEL: foo: ; CHECK-X86-64: # %bb.0: -; CHECK-X86-64-NEXT: movq %rsp, %r11 -; CHECK-X86-64-NEXT: subq $69632, %r11 # imm = 0x11000 +; CHECK-X86-64-NEXT: movabsq $-71880, %rax # imm = 0xFFFEE738 +; CHECK-X86-64-NEXT: movabsq $-4096, %r11 # imm = 0xF000 ; CHECK-X86-64-NEXT: .LBB0_1: # =>This Inner Loop Header: Depth=1 -; CHECK-X86-64-NEXT: subq $4096, %rsp # imm = 0x1000 -; CHECK-X86-64-NEXT: movq $0, (%rsp) -; CHECK-X86-64-NEXT: cmpq %r11, %rsp -; CHECK-X86-64-NEXT: jne .LBB0_1 +; CHECK-X86-64-NEXT: movb $0, (%rsp,%r11) +; CHECK-X86-64-NEXT: subq $4096, %r11 # imm = 0x1000 +; CHECK-X86-64-NEXT: cmpq %rax, %r11 +; CHECK-X86-64-NEXT: jg .LBB0_1 ; CHECK-X86-64-NEXT: # %bb.2: -; CHECK-X86-64-NEXT: subq $2248, %rsp # imm = 0x8C8 +; CHECK-X86-64-NEXT: subq $71880, %rsp # imm = 0x118C8 ; CHECK-X86-64-NEXT: .cfi_def_cfa_offset 71888 ; CHECK-X86-64-NEXT: movl $1, 264(%rsp) ; CHECK-X86-64-NEXT: movl $1, 28664(%rsp) @@ -24,15 +24,15 @@ ; ; CHECK-X86-32-LABEL: foo: ; CHECK-X86-32: # %bb.0: -; CHECK-X86-32-NEXT: movl %esp, %r11d -; CHECK-X86-32-NEXT: subl $69632, %r11d # imm = 0x11000 +; CHECK-X86-32-NEXT: movl $-72012, %eax # imm = 0xFFFEE6B4 +; CHECK-X86-32-NEXT: movl $-4096, %ebx # imm = 0xF000 ; CHECK-X86-32-NEXT: .LBB0_1: # =>This Inner Loop Header: Depth=1 -; CHECK-X86-32-NEXT: subl $4096, %esp # imm = 0x1000 -; CHECK-X86-32-NEXT: movl $0, (%esp) -; CHECK-X86-32-NEXT: cmpl %r11d, %esp -; CHECK-X86-32-NEXT: jne .LBB0_1 +; CHECK-X86-32-NEXT: movb $0, (%esp,%ebx) +; CHECK-X86-32-NEXT: subl $4096, %ebx # imm = 0x1000 +; CHECK-X86-32-NEXT: cmpl %eax, %ebx +; CHECK-X86-32-NEXT: jg .LBB0_1 ; CHECK-X86-32-NEXT: # 
%bb.2: -; CHECK-X86-32-NEXT: subl $2380, %esp # imm = 0x94C +; CHECK-X86-32-NEXT: subl $72012, %esp # imm = 0x1194C ; CHECK-X86-32-NEXT: .cfi_def_cfa_offset 72016 ; CHECK-X86-32-NEXT: movl $1, 392(%esp) ; CHECK-X86-32-NEXT: movl $1, 28792(%esp) diff --git a/llvm/test/CodeGen/X86/stack-clash-small-large-align.ll b/llvm/test/CodeGen/X86/stack-clash-small-large-align.ll --- a/llvm/test/CodeGen/X86/stack-clash-small-large-align.ll +++ b/llvm/test/CodeGen/X86/stack-clash-small-large-align.ll @@ -51,14 +51,15 @@ ; CHECK-NEXT: movq %r11, %rsp ; CHECK-NEXT: movq $0, (%rsp) ; CHECK-NEXT: .LBB1_4: -; CHECK-NEXT: movq %rsp, %r11 -; CHECK-NEXT: subq $65536, %r11 # imm = 0x10000 +; CHECK-NEXT: movabsq $-65536, %rax # imm = 0xFFFF0000 +; CHECK-NEXT: movabsq $-4096, %r11 # imm = 0xF000 ; CHECK-NEXT: .LBB1_5: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: subq $4096, %rsp # imm = 0x1000 -; CHECK-NEXT: movq $0, (%rsp) -; CHECK-NEXT: cmpq %r11, %rsp -; CHECK-NEXT: jne .LBB1_5 +; CHECK-NEXT: movb $0, (%rsp,%r11) +; CHECK-NEXT: subq $4096, %r11 # imm = 0x1000 +; CHECK-NEXT: cmpq %rax, %r11 +; CHECK-NEXT: jg .LBB1_5 ; CHECK-NEXT: # %bb.6: +; CHECK-NEXT: subq $65536, %rsp # imm = 0x10000 ; CHECK-NEXT: movl $1, 392(%rsp) ; CHECK-NEXT: movl (%rsp), %eax ; CHECK-NEXT: movq %rbp, %rsp