diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp
--- a/llvm/lib/Target/X86/X86FrameLowering.cpp
+++ b/llvm/lib/Target/X86/X86FrameLowering.cpp
@@ -102,6 +102,16 @@
           MFI.hasCopyImplyingStackAdjustment());
 }
 
+static unsigned getMOVriOpcode(bool Is64Bit, int64_t Imm) {
+  if (Is64Bit) {
+    if (isInt<32>(Imm))
+      return X86::MOV64ri32;
+    return X86::MOV64ri;
+  } else {
+    return X86::MOV32ri;
+  }
+}
+
 static unsigned getSUBriOpcode(bool IsLP64, int64_t Imm) {
   if (IsLP64) {
     if (isInt<8>(Imm))
@@ -237,11 +247,10 @@
     else
       Reg = TRI->findDeadCallerSavedReg(MBB, MBBI);
 
-    unsigned MovRIOpc = Is64Bit ? X86::MOV64ri : X86::MOV32ri;
     unsigned AddSubRROpc =
        isSub ? getSUBrrOpcode(Is64Bit) : getADDrrOpcode(Is64Bit);
     if (Reg) {
-      BuildMI(MBB, MBBI, DL, TII.get(MovRIOpc), Reg)
+      BuildMI(MBB, MBBI, DL, TII.get(getMOVriOpcode(Is64Bit, Offset)), Reg)
          .addImm(Offset)
          .setMIFlag(Flag);
       MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(AddSubRROpc), StackPtr)
@@ -267,7 +276,7 @@
        Offset = -(Offset - SlotSize);
       else
        Offset = Offset + SlotSize;
-      BuildMI(MBB, MBBI, DL, TII.get(MovRIOpc), Rax)
+      BuildMI(MBB, MBBI, DL, TII.get(getMOVriOpcode(Is64Bit, Offset)), Rax)
          .addImm(Offset)
          .setMIFlag(Flag);
       MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(X86::ADD64rr), Rax)
@@ -529,6 +538,7 @@
   const X86TargetLowering &TLI = *STI.getTargetLowering();
   assert(!(STI.is64Bit() && STI.isTargetWindowsCoreCLR()) &&
          "different expansion expected for CoreCLR 64 bit");
+  assert(InProlog && "different expansion expected outside prolog");
 
   const uint64_t StackProbeSize = TLI.getStackProbeSize(MF);
   uint64_t ProbeChunk = StackProbeSize * 8;
@@ -618,80 +628,98 @@
   const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
   const X86TargetLowering &TLI = *STI.getTargetLowering();
-  const unsigned MovMIOpc = Is64Bit ? X86::MOV64mi32 : X86::MOV32mi;
+  const unsigned MovMROpc = Is64Bit ? X86::MOV64mr : X86::MOV32mr,
+                 MovRMOpc = Is64Bit ? X86::MOV64rm : X86::MOV32rm;
   const uint64_t StackProbeSize = TLI.getStackProbeSize(MF);
 
-  if (AlignOffset) {
-    if (AlignOffset < StackProbeSize) {
-      // Perform a first smaller allocation followed by a probe.
-      const unsigned SUBOpc = getSUBriOpcode(Uses64BitFramePtr, AlignOffset);
-      MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(SUBOpc), StackPtr)
-                             .addReg(StackPtr)
-                             .addImm(AlignOffset)
-                             .setMIFlag(MachineInstr::FrameSetup);
-      MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead.
+  // TODO: We can avoid using one of the registers if `Offset` fits into a
+  // `cmp` instruction immediate operand.
+  // TODO: Given that we have register liveness information here, we could
+  // select the registers to use in a manner that almost never results in a
+  // spill in practice.
 
-      addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(MovMIOpc))
-                       .setMIFlag(MachineInstr::FrameSetup),
-                   StackPtr, false, 0)
-          .addImm(0)
-          .setMIFlag(MachineInstr::FrameSetup);
-      NumFrameExtraProbe++;
-      Offset -= AlignOffset;
-    }
-  }
+  // Set up some registers for the loop.
+  // We'll be using RAX/EAX and R11/EBX to minimize the chances that we need
+  // to spill.
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  LivePhysRegs UsedRegs(*TRI);
+  UsedRegs.addLiveOuts(MBB);
+  for (auto I = --MBB.end(); I != MBBI; --I)
+    UsedRegs.stepBackward(*I);
+
+  const Register SizeReg = Uses64BitFramePtr ? X86::RAX : X86::EAX,
+                 PositionRegFor64 = Uses64BitFramePtr ? X86::R11 : X86::R11D,
+                 PositionReg = Is64Bit ? PositionRegFor64 : X86::EBX;
+  const bool IsSizeRegLive = !UsedRegs.available(MRI, SizeReg);
+  const bool IsPositionRegLive = !UsedRegs.available(MRI, PositionReg);
+
+  // Pick some save slots that are definitely not going to conflict (we're
+  // probing, so we have a full page's worth of offsets below `rsp` to use
+  // for our purposes).
+  // TODO: Is there a less nasty way to stash these registers?
+  int64_t SizeRegSaveSlot = 8 * IsSizeRegLive,
+          PositionRegSaveSlot = SizeRegSaveSlot + 8;
+  if (IsSizeRegLive)
+    addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(MovMROpc)), X86::RSP, false,
+                 -SizeRegSaveSlot)
+        .addReg(SizeReg)
+        .setMIFlag(MachineInstr::FrameSetup);
+  if (IsPositionRegLive)
+    addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(MovMROpc)), X86::RSP, false,
+                 -PositionRegSaveSlot)
+        .addReg(PositionReg)
+        .setMIFlag(MachineInstr::FrameSetup);
+
+  // Prepare our loop registers.
+  BuildMI(MBB, MBBI, DL, TII.get(getMOVriOpcode(Is64Bit, -Offset)), SizeReg)
+      .addImm(-Offset)
+      .setMIFlag(MachineInstr::FrameSetup);
+  if (AlignOffset != 0 && AlignOffset < StackProbeSize) {
+    // Perform a first smaller allocation followed by regular-sized probes.
+    BuildMI(MBB, MBBI, DL, TII.get(getMOVriOpcode(Is64Bit, -AlignOffset)),
+            PositionReg)
+        .addImm(-AlignOffset)
+        .setMIFlag(MachineInstr::FrameSetup);
+  } else {
+    // Otherwise allocate in pages.
+    BuildMI(MBB, MBBI, DL, TII.get(getMOVriOpcode(Is64Bit, -StackProbeSize)),
+            PositionReg)
+        .addImm(-StackProbeSize)
+        .setMIFlag(MachineInstr::FrameSetup);
+  }
 
   // Synthesize a loop
   NumFrameLoopProbe++;
   const BasicBlock *LLVM_BB = MBB.getBasicBlock();
-
   MachineBasicBlock *testMBB = MF.CreateMachineBasicBlock(LLVM_BB);
   MachineBasicBlock *tailMBB = MF.CreateMachineBasicBlock(LLVM_BB);
-
   MachineFunction::iterator MBBIter = ++MBB.getIterator();
   MF.insert(MBBIter, testMBB);
   MF.insert(MBBIter, tailMBB);
 
-  Register FinalStackProbed = Uses64BitFramePtr ? X86::R11 : X86::R11D;
-  BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::COPY), FinalStackProbed)
-      .addReg(StackPtr)
-      .setMIFlag(MachineInstr::FrameSetup);
-
-  // save loop bound
-  {
-    const unsigned SUBOpc = getSUBriOpcode(Uses64BitFramePtr, Offset);
-    BuildMI(MBB, MBBI, DL, TII.get(SUBOpc), FinalStackProbed)
-        .addReg(FinalStackProbed)
-        .addImm(Offset / StackProbeSize * StackProbeSize)
-        .setMIFlag(MachineInstr::FrameSetup);
-  }
-
-  // allocate a page
-  {
-    const unsigned SUBOpc = getSUBriOpcode(Uses64BitFramePtr, StackProbeSize);
-    BuildMI(testMBB, DL, TII.get(SUBOpc), StackPtr)
-        .addReg(StackPtr)
-        .addImm(StackProbeSize)
-        .setMIFlag(MachineInstr::FrameSetup);
-  }
-
-  // touch the page
-  addRegOffset(BuildMI(testMBB, DL, TII.get(MovMIOpc))
-                   .setMIFlag(MachineInstr::FrameSetup),
-               StackPtr, false, 0)
+  // Touch the page at StackPtr + PositionReg.
+  addRegReg(BuildMI(testMBB, DL, TII.get(X86::MOV8mi)), StackPtr, false,
+            PositionReg, false)
       .addImm(0)
       .setMIFlag(MachineInstr::FrameSetup);
-
-  // cmp with stack pointer bound
+  // "Allocate" the next page.
+  BuildMI(testMBB, DL,
+          TII.get(getSUBriOpcode(Uses64BitFramePtr, StackProbeSize)),
+          PositionReg)
+      .addReg(PositionReg)
+      .addImm(StackProbeSize)
+      .setMIFlag(MachineInstr::FrameSetup);
+  // Compare against the total number of bytes we must allocate.
   BuildMI(testMBB, DL,
           TII.get(Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr))
-      .addReg(StackPtr)
-      .addReg(FinalStackProbed)
+      .addReg(PositionReg)
+      .addReg(SizeReg)
       .setMIFlag(MachineInstr::FrameSetup);
-
-  // jump
   BuildMI(testMBB, DL, TII.get(X86::JCC_1))
       .addMBB(testMBB)
-      .addImm(X86::COND_NE)
+      .addImm(X86::COND_G)
       .setMIFlag(MachineInstr::FrameSetup);
   testMBB->addSuccessor(testMBB);
   testMBB->addSuccessor(tailMBB);
@@ -702,14 +730,22 @@
   MBB.addSuccessor(testMBB);
 
   // handle tail
-  unsigned TailOffset = Offset % StackProbeSize;
-  if (TailOffset) {
-    const unsigned Opc = getSUBriOpcode(Uses64BitFramePtr, TailOffset);
-    BuildMI(*tailMBB, tailMBB->begin(), DL, TII.get(Opc), StackPtr)
-        .addReg(StackPtr)
-        .addImm(TailOffset)
+  const unsigned Opc = getSUBriOpcode(Uses64BitFramePtr, Offset);
+  BuildMI(*tailMBB, tailMBB->begin(), DL, TII.get(Opc), StackPtr)
+      .addReg(StackPtr)
+      .addImm(Offset)
+      .setMIFlag(MachineInstr::FrameSetup);
+  if (IsPositionRegLive)
+    addRegOffset(BuildMI(*tailMBB, tailMBB->begin(), DL, TII.get(MovRMOpc))
+                     .addReg(PositionReg, RegState::Define),
+                 X86::RSP, false, -PositionRegSaveSlot)
        .setMIFlag(MachineInstr::FrameSetup);
-  }
+  if (IsSizeRegLive)
+    addRegOffset(BuildMI(*tailMBB, tailMBB->begin(), DL, TII.get(MovRMOpc))
+                     .addReg(SizeReg, RegState::Define),
+                 X86::RSP, false, -SizeRegSaveSlot)
+        .setMIFlag(MachineInstr::FrameSetup);
+  // TODO: Pop old values for the registers we used.
 
   // Update Live In information
   recomputeLiveIns(*testMBB);
diff --git a/llvm/test/CodeGen/X86/stack-clash-large-large-align.ll b/llvm/test/CodeGen/X86/stack-clash-large-large-align.ll
--- a/llvm/test/CodeGen/X86/stack-clash-large-large-align.ll
+++ b/llvm/test/CodeGen/X86/stack-clash-large-large-align.ll
@@ -55,14 +55,15 @@
 ; CHECK-NEXT:    movq %r11, %rsp
 ; CHECK-NEXT:    movq $0, (%rsp)
 ; CHECK-NEXT:  .LBB1_4:
-; CHECK-NEXT:    movq %rsp, %r11
-; CHECK-NEXT:    subq $73728, %r11 # imm = 0x12000
+; CHECK-NEXT:    movabsq $-73728, %rax # imm = 0xFFFEE000
+; CHECK-NEXT:    movabsq $-4096, %r11 # imm = 0xF000
 ; CHECK-NEXT:  .LBB1_5: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    subq $4096, %rsp # imm = 0x1000
-; CHECK-NEXT:    movq $0, (%rsp)
-; CHECK-NEXT:    cmpq %r11, %rsp
-; CHECK-NEXT:    jne .LBB1_5
+; CHECK-NEXT:    movb $0, (%rsp,%r11)
+; CHECK-NEXT:    subq $4096, %r11 # imm = 0x1000
+; CHECK-NEXT:    cmpq %rax, %r11
+; CHECK-NEXT:    jg .LBB1_5
 ; CHECK-NEXT:  # %bb.6:
+; CHECK-NEXT:    subq $73728, %rsp # imm = 0x12000
 ; CHECK-NEXT:    movl $1, 392(%rsp)
 ; CHECK-NEXT:    movl $1, 28792(%rsp)
 ; CHECK-NEXT:    movl (%rsp), %eax
diff --git a/llvm/test/CodeGen/X86/stack-clash-large.ll b/llvm/test/CodeGen/X86/stack-clash-large.ll
--- a/llvm/test/CodeGen/X86/stack-clash-large.ll
+++ b/llvm/test/CodeGen/X86/stack-clash-large.ll
@@ -1,45 +1,20 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --no_x86_scrub_sp
-; RUN: llc -mtriple=x86_64-linux-android < %s | FileCheck -check-prefix=CHECK-X86-64 %s
-; RUN: llc -mtriple=i686-linux-android < %s | FileCheck -check-prefix=CHECK-X86-32 %s
+; RUN: llc -mtriple=x86_64-linux-android < %s | FileCheck -check-prefix=CHECK-X86_64 %s
+; RUN: llc -mtriple=i686-linux-android < %s | FileCheck -check-prefix=CHECK-X86 %s
+; RUN: llc -mtriple=x86_64-gnux32 < %s | FileCheck -check-prefix=CHECK-X32 %s
 
-define i32 @foo() local_unnamed_addr #0 {
-; CHECK-X86-64-LABEL: foo:
-; CHECK-X86-64:       # %bb.0:
-; CHECK-X86-64-NEXT:    movq %rsp, %r11
-; CHECK-X86-64-NEXT:    subq $69632, %r11 # imm = 0x11000
-; CHECK-X86-64-NEXT:  .LBB0_1: # =>This Inner Loop Header: Depth=1
-; CHECK-X86-64-NEXT:    subq $4096, %rsp # imm = 0x1000
-; CHECK-X86-64-NEXT:    movq $0, (%rsp)
-; CHECK-X86-64-NEXT:    cmpq %r11, %rsp
-; CHECK-X86-64-NEXT:    jne .LBB0_1
-; CHECK-X86-64-NEXT:  # %bb.2:
-; CHECK-X86-64-NEXT:    subq $2248, %rsp # imm = 0x8C8
-; CHECK-X86-64-NEXT:    .cfi_def_cfa_offset 71888
-; CHECK-X86-64-NEXT:    movl $1, 264(%rsp)
-; CHECK-X86-64-NEXT:    movl $1, 28664(%rsp)
-; CHECK-X86-64-NEXT:    movl -128(%rsp), %eax
-; CHECK-X86-64-NEXT:    addq $71880, %rsp # imm = 0x118C8
-; CHECK-X86-64-NEXT:    .cfi_def_cfa_offset 8
-; CHECK-X86-64-NEXT:    retq
-;
-; CHECK-X86-32-LABEL: foo:
-; CHECK-X86-32:       # %bb.0:
-; CHECK-X86-32-NEXT:    movl %esp, %r11d
-; CHECK-X86-32-NEXT:    subl $69632, %r11d # imm = 0x11000
-; CHECK-X86-32-NEXT:  .LBB0_1: # =>This Inner Loop Header: Depth=1
-; CHECK-X86-32-NEXT:    subl $4096, %esp # imm = 0x1000
-; CHECK-X86-32-NEXT:    movl $0, (%esp)
-; CHECK-X86-32-NEXT:    cmpl %r11d, %esp
-; CHECK-X86-32-NEXT:    jne .LBB0_1
-; CHECK-X86-32-NEXT:  # %bb.2:
-; CHECK-X86-32-NEXT:    subl $2380, %esp # imm = 0x94C
-; CHECK-X86-32-NEXT:    .cfi_def_cfa_offset 72016
-; CHECK-X86-32-NEXT:    movl $1, 392(%esp)
-; CHECK-X86-32-NEXT:    movl $1, 28792(%esp)
-; CHECK-X86-32-NEXT:    movl (%esp), %eax
-; CHECK-X86-32-NEXT:    addl $72012, %esp # imm = 0x1194C
-; CHECK-X86-32-NEXT:    .cfi_def_cfa_offset 4
-; CHECK-X86-32-NEXT:    retl
+; Probe in the prologue
+; define i32 @simple() "probe-stack"="inline-asm" {
+;   %a = alloca i32, i64 18000, align 16
+;   %b0 = getelementptr inbounds i32, i32* %a, i64 98
+;   %b1 = getelementptr inbounds i32, i32* %a, i64 7198
+;   store volatile i32 1, i32* %b0
+;   store volatile i32 1, i32* %b1
+;   %c = load volatile i32, i32* %a
+;   ret i32 %c
+; }
+
+define i32 @no_available_registers() "probe-stack"="inline-asm" "no_caller_saved_registers" {
   %a = alloca i32, i64 18000, align 16
   %b0 = getelementptr inbounds i32, i32* %a, i64 98
   %b1 = getelementptr inbounds i32, i32* %a, i64 7198
@@ -49,4 +24,19 @@
   ret i32 %c
 }
 
-attributes #0 = {"probe-stack"="inline-asm"}
+define void @no_available_registers_many_args(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h) "probe-stack"="inline-asm" "no_caller_saved_registers" {
+  %all = alloca i32, i64 18000, align 16
+  %ab = add i32 %a, %b
+  %cd = add i32 %c, %d
+  %abcd = add i32 %ab, %cd
+  %b0 = getelementptr inbounds i32, i32* %all, i64 98
+  store volatile i32 %abcd, i32* %b0
+
+  %ef = add i32 %e, %f
+  %gh = add i32 %g, %h
+  %efgh = add i32 %ef, %gh
+  %b1 = getelementptr inbounds i32, i32* %all, i64 7198
+  store volatile i32 %efgh, i32* %b1
+
+  ret void
+}
diff --git a/llvm/test/CodeGen/X86/stack-clash-small-large-align.ll b/llvm/test/CodeGen/X86/stack-clash-small-large-align.ll
--- a/llvm/test/CodeGen/X86/stack-clash-small-large-align.ll
+++ b/llvm/test/CodeGen/X86/stack-clash-small-large-align.ll
@@ -51,14 +51,15 @@
 ; CHECK-NEXT:    movq %r11, %rsp
 ; CHECK-NEXT:    movq $0, (%rsp)
 ; CHECK-NEXT:  .LBB1_4:
-; CHECK-NEXT:    movq %rsp, %r11
-; CHECK-NEXT:    subq $65536, %r11 # imm = 0x10000
+; CHECK-NEXT:    movabsq $-65536, %rax # imm = 0xFFFF0000
+; CHECK-NEXT:    movabsq $-4096, %r11 # imm = 0xF000
 ; CHECK-NEXT:  .LBB1_5: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    subq $4096, %rsp # imm = 0x1000
-; CHECK-NEXT:    movq $0, (%rsp)
-; CHECK-NEXT:    cmpq %r11, %rsp
-; CHECK-NEXT:    jne .LBB1_5
+; CHECK-NEXT:    movb $0, (%rsp,%r11)
+; CHECK-NEXT:    subq $4096, %r11 # imm = 0x1000
+; CHECK-NEXT:    cmpq %rax, %r11
+; CHECK-NEXT:    jg .LBB1_5
 ; CHECK-NEXT:  # %bb.6:
+; CHECK-NEXT:    subq $65536, %rsp # imm = 0x10000
 ; CHECK-NEXT:    movl $1, 392(%rsp)
 ; CHECK-NEXT:    movl (%rsp), %eax
 ; CHECK-NEXT:    movq %rbp, %rsp