Index: lib/Target/X86/X86FrameLowering.cpp
===================================================================
--- lib/Target/X86/X86FrameLowering.cpp
+++ lib/Target/X86/X86FrameLowering.cpp
@@ -252,40 +252,76 @@
                                     int64_t NumBytes, bool InEpilogue) const {
   bool isSub = NumBytes < 0;
   uint64_t Offset = isSub ? -NumBytes : NumBytes;
+  MachineInstr::MIFlag Flag =
+      isSub ? MachineInstr::FrameSetup : MachineInstr::FrameDestroy;
 
   uint64_t Chunk = (1LL << 31) - 1;
   DebugLoc DL = MBB.findDebugLoc(MBBI);
 
-  while (Offset) {
-    if (Offset > Chunk) {
-      // Rather than emit a long series of instructions for large offsets,
-      // load the offset into a register and do one sub/add
-      unsigned Reg = 0;
+  if (Offset > Chunk) {
+    // Rather than emit a long series of instructions for large offsets,
+    // load the offset into a register and do one sub/add
+    unsigned Reg = 0;
+    unsigned Rax = (unsigned)(Is64Bit ? X86::RAX : X86::EAX);
 
-      if (isSub && !isEAXLiveIn(MBB))
-        Reg = (unsigned)(Is64Bit ? X86::RAX : X86::EAX);
+    if (isSub && !isEAXLiveIn(MBB))
+      Reg = Rax;
+    else
+      Reg = findDeadCallerSavedReg(MBB, MBBI, TRI, Is64Bit);
+
+    unsigned MovRIOpc = Is64Bit ? X86::MOV64ri : X86::MOV32ri;
+    unsigned AddSubRROpc =
+        isSub ? getSUBrrOpcode(Is64Bit) : getADDrrOpcode(Is64Bit);
+    if (Reg) {
+      BuildMI(MBB, MBBI, DL, TII.get(MovRIOpc), Reg)
+          .addImm(Offset)
+          .setMIFlag(Flag);
+      MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(AddSubRROpc), StackPtr)
+                             .addReg(StackPtr)
+                             .addReg(Reg);
+      MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead.
+      return;
+    } else if (Offset > 8 * Chunk) {
+      // If we would need more than 8 add or sub instructions (a >16GB stack
+      // frame), it's worth spilling RAX to materialize this immediate.
+      //   pushq %rax
+      //   movabsq +-$Offset+-SlotSize, %rax
+      //   addq %rsp, %rax
+      //   xchg %rax, (%rsp)
+      //   movq (%rsp), %rsp
+      assert(Is64Bit && "can't have 32-bit 16GB stack frame");
+      BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH64r))
+          .addReg(Rax, RegState::Kill)
+          .setMIFlag(Flag);
+      // Subtract is not commutative, so negate the offset and always use add.
+      // Subtract 8 less and add 8 more to account for the PUSH we just did.
+      if (isSub)
+        Offset = -(Offset - SlotSize);
       else
-        Reg = findDeadCallerSavedReg(MBB, MBBI, TRI, Is64Bit);
-
-      if (Reg) {
-        unsigned Opc = Is64Bit ? X86::MOV64ri : X86::MOV32ri;
-        BuildMI(MBB, MBBI, DL, TII.get(Opc), Reg)
-          .addImm(Offset);
-        Opc = isSub
-          ? getSUBrrOpcode(Is64Bit)
-          : getADDrrOpcode(Is64Bit);
-        MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr)
-          .addReg(StackPtr)
-          .addReg(Reg);
-        MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead.
-        Offset = 0;
-        continue;
-      }
+        Offset = Offset + SlotSize;
+      BuildMI(MBB, MBBI, DL, TII.get(MovRIOpc), Rax)
+          .addImm(Offset)
+          .setMIFlag(Flag);
+      MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(X86::ADD64rr), Rax)
+                             .addReg(Rax)
+                             .addReg(StackPtr);
+      MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead.
+      // Exchange the new SP in RAX with the top of the stack.
+      addRegOffset(
+          BuildMI(MBB, MBBI, DL, TII.get(X86::XCHG64rm), Rax).addReg(Rax),
+          StackPtr, false, 0);
+      // Load new SP from the top of the stack into RSP.
+      addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64rm), StackPtr),
+                   StackPtr, false, 0);
+      return;
     }
+  }
 
+  while (Offset) {
     uint64_t ThisVal = std::min(Offset, Chunk);
-    if (ThisVal == (Is64Bit ? 8 : 4)) {
-      // Use push / pop instead.
+    if (ThisVal == SlotSize) {
+      // Use push / pop for slot sized adjustments as a size optimization. We
+      // need to find a dead register when using pop.
      unsigned Reg = isSub
        ? (unsigned)(Is64Bit ? X86::RAX : X86::EAX)
        : findDeadCallerSavedReg(MBB, MBBI, TRI, Is64Bit);
@@ -293,23 +329,16 @@
         unsigned Opc = isSub
           ? (Is64Bit ? X86::PUSH64r : X86::PUSH32r)
           : (Is64Bit ? X86::POP64r : X86::POP32r);
-        MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(Opc))
-          .addReg(Reg, getDefRegState(!isSub) | getUndefRegState(isSub));
-        if (isSub)
-          MI->setFlag(MachineInstr::FrameSetup);
-        else
-          MI->setFlag(MachineInstr::FrameDestroy);
+        BuildMI(MBB, MBBI, DL, TII.get(Opc))
+            .addReg(Reg, getDefRegState(!isSub) | getUndefRegState(isSub))
+            .setMIFlag(Flag);
         Offset -= ThisVal;
         continue;
       }
     }
 
-    MachineInstrBuilder MI = BuildStackAdjustment(
-        MBB, MBBI, DL, isSub ? -ThisVal : ThisVal, InEpilogue);
-    if (isSub)
-      MI.setMIFlag(MachineInstr::FrameSetup);
-    else
-      MI.setMIFlag(MachineInstr::FrameDestroy);
+    BuildStackAdjustment(MBB, MBBI, DL, isSub ? -ThisVal : ThisVal, InEpilogue)
+        .setMIFlag(Flag);
 
     Offset -= ThisVal;
   }
Index: test/CodeGen/X86/huge-stack-offset2.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/huge-stack-offset2.ll
@@ -0,0 +1,62 @@
+; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s --check-prefix=CHECK
+
+; Test how we handle pathologically large stack frames when RAX is live through
+; the prologue and epilogue.
+
+declare void @bar(i8*)
+declare void @llvm.va_start(i8*)
+
+; For stack frames between 2GB and 16GB, do multiple adjustments.
+
+define i32 @stack_frame_8gb(i32 %x, ...) nounwind {
+; CHECK-LABEL: stack_frame_8gb:
+; CHECK: subq ${{.*}}, %rsp # imm = 0x7FFFFFFF
+; CHECK: subq ${{.*}}, %rsp # imm = 0x7FFFFFFF
+; CHECK: subq ${{.*}}, %rsp # imm = 0x7FFFFFFF
+; CHECK: subq ${{.*}}, %rsp # imm = 0x7FFFFFFF
+; CHECK: subq ${{.*}}, %rsp
+; CHECK: callq bar
+; CHECK: addq ${{.*}}, %rsp # imm = 0x7FFFFFFF
+; CHECK: addq ${{.*}}, %rsp # imm = 0x7FFFFFFF
+; CHECK: addq ${{.*}}, %rsp # imm = 0x7FFFFFFF
+; CHECK: addq ${{.*}}, %rsp # imm = 0x7FFFFFFF
+; CHECK: addq ${{.*}}, %rsp
+; CHECK: retq
+  %1 = alloca [u0x200000000 x i8]
+  %va = alloca i8, i32 24
+  call void @llvm.va_start(i8* %va)
+  %2 = getelementptr inbounds [u0x200000000 x i8], [u0x200000000 x i8]* %1, i32 0, i32 0
+  call void @bar(i8* %2)
+  ret i32 %x
+}
+
+; For stack frames larger than 16GB, spill EAX instead of doing a linear number
+; of adjustments.
+
+; This function should have a frame size of 0x4000000D0. The 0xD0 is 208 bytes
+; from 24 bytes of va_list, 176 bytes of spilled varargs regparms, and 8 bytes
+; of alignment. We subtract 8 less and add 8 more in the prologue and epilogue
+; respectively to account for the PUSH.
+
+define i32 @stack_frame_16gb(i32 %x, ...) nounwind {
+; CHECK-LABEL: stack_frame_16gb:
+; CHECK: pushq %rax
+; CHECK-NEXT: movabsq ${{.*}}, %rax # imm = 0xFFFFFFFBFFFFFF38
+; CHECK-NEXT: addq %rsp, %rax
+; CHECK-NEXT: xchgq %rax, (%rsp)
+; CHECK-NEXT: movq (%rsp), %rsp
+; CHECK: callq bar
+; CHECK: pushq %rax
+; CHECK-NEXT: movabsq ${{.*}}, %rax # imm = 0x4000000D8
+; CHECK-NEXT: addq %rsp, %rax
+; CHECK-NEXT: xchgq %rax, (%rsp)
+; CHECK-NEXT: movq (%rsp), %rsp
+; CHECK: retq
+  %1 = alloca [u0x400000000 x i8]
+  %va = alloca i8, i32 24
+  call void @llvm.va_start(i8* %va)
+  %2 = getelementptr inbounds [u0x400000000 x i8], [u0x400000000 x i8]* %1, i32 0, i32 0
+  call void @bar(i8* %2)
+  ret i32 %x
+}
+
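A quick cross-check of the movabsq immediates expected by stack_frame_16gb: they fall directly out of the offset rewrite in the >16GB branch of the patch (prologue: Offset = -(Offset - SlotSize); epilogue: Offset = Offset + SlotSize). The following is a minimal standalone C++ sketch, not part of the patch, assuming the 0x4000000D0 frame size stated in the test comment and the 8-byte x86-64 slot size:

#include <cstdint>
#include <cstdio>

int main() {
  uint64_t Offset = 0x4000000D0ULL; // frame size from the test comment
  uint64_t SlotSize = 8;            // one stack slot, consumed by pushq %rax
  // Prologue: shrink by SlotSize (the pushq already moved RSP by 8), then
  // negate so a single ADD can be used; unsigned negation wraps modulo 2^64,
  // matching the 64-bit immediate the backend materializes.
  uint64_t Prologue = -(Offset - SlotSize);
  // Epilogue: grow by SlotSize so the adjustment also steps past the slot
  // holding the saved RAX.
  uint64_t Epilogue = Offset + SlotSize;
  printf("prologue imm = %#llx\n", (unsigned long long)Prologue);
  printf("epilogue imm = %#llx\n", (unsigned long long)Epilogue);
}

This prints 0xfffffffbffffff38 and 0x4000000d8, matching the # imm annotations in the CHECK-NEXT lines above; it also explains why the prologue constant reads as a huge two's-complement negative rather than the raw frame size.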