Index: include/llvm/CodeGen/MachineFunction.h =================================================================== --- include/llvm/CodeGen/MachineFunction.h +++ include/llvm/CodeGen/MachineFunction.h @@ -287,6 +287,14 @@ /// Should we be emitting segmented stack stuff for the function bool shouldSplitStack(); + /// \brief Should we be probing the stack for the function. + /// + /// Probing the stack means that we must read or write to the stack on every + /// page. This is to ensure that a guard page will be hit and stack overflow + /// can be detected. We insert instructions to do this when allocating from + /// the stack. + bool shouldProbeStack() const; + /// getNumBlockIDs - Return the number of MBB ID's allocated. /// unsigned getNumBlockIDs() const { return (unsigned)MBBNumbering.size(); } Index: lib/CodeGen/MachineFunction.cpp =================================================================== --- lib/CodeGen/MachineFunction.cpp +++ lib/CodeGen/MachineFunction.cpp @@ -147,6 +147,10 @@ return getFunction()->hasFnAttribute("split-stack"); } +bool MachineFunction::shouldProbeStack() const { + return getFunction()->hasFnAttribute("probe-stack"); +} + /// This discards all of the MachineBasicBlock numbers and recomputes them. /// This guarantees that the MBB numbers are sequential, dense, and match the /// ordering of the blocks within the function. If a specific MachineBasicBlock Index: lib/CodeGen/PrologEpilogInserter.cpp =================================================================== --- lib/CodeGen/PrologEpilogInserter.cpp +++ lib/CodeGen/PrologEpilogInserter.cpp @@ -773,6 +773,10 @@ // Add prologue to the function... TFI.emitPrologue(Fn, *SaveBlock); + // RestoreBlocks can be clobbered by emitPrologue. Recalculate it. + RestoreBlocks.clear(); + calculateSets(Fn); + // Add epilogue to restore the callee-save registers in each exiting block. for (MachineBasicBlock *RestoreBlock : RestoreBlocks) TFI.emitEpilogue(Fn, *RestoreBlock); Index: lib/Target/X86/X86FrameLowering.h =================================================================== --- lib/Target/X86/X86FrameLowering.h +++ lib/Target/X86/X86FrameLowering.h @@ -47,19 +47,40 @@ unsigned StackPtr; + void pushRegForStackProbeCall(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + DebugLoc DL, + bool &IsAlive, + unsigned RegType, + uint64_t &NumBytes) const; + void popRegForStackProbeCall(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + DebugLoc DL, + bool &IsAlive, + unsigned RegType, + uint64_t &NumBytes) const; + /// Emit a call to the target's stack probe function. This is required for all /// large stack allocations on Windows. The caller is required to materialize /// the number of bytes to probe in RAX/EAX. void emitStackProbeCall(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, DebugLoc DL) const; + /// Emit a loop that probes the stack. The caller is required to materialize + /// the number of bytes to probe in RAX/EAX. + MachineInstr *emitStackProbes(MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, DebugLoc DL, + bool InProlog) const; + void emitCalleeSavedFrameMoves(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, DebugLoc DL) const; /// emitProlog/emitEpilog - These methods insert prolog and epilog code into /// the function. - void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override; + void emitPrologue(MachineFunction &MF, MachineBasicBlock &InMBB) const override; void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override; void adjustForSegmentedStacks(MachineFunction &MF, Index: lib/Target/X86/X86FrameLowering.cpp =================================================================== --- lib/Target/X86/X86FrameLowering.cpp +++ lib/Target/X86/X86FrameLowering.cpp @@ -197,13 +197,14 @@ return 0; } -static bool isEAXLiveIn(MachineFunction &MF) { +static bool isLiveIn(MachineFunction &MF, unsigned CheckReg) { + CheckReg = getX86SubSuperRegister(CheckReg, MVT::i32); + for (MachineRegisterInfo::livein_iterator II = MF.getRegInfo().livein_begin(), EE = MF.getRegInfo().livein_end(); II != EE; ++II) { unsigned Reg = II->first; - if (Reg == X86::RAX || Reg == X86::EAX || Reg == X86::AX || - Reg == X86::AH || Reg == X86::AL) + if (getX86SubSuperRegisterOrZero(Reg, MVT::i32) == CheckReg) return true; } @@ -250,7 +251,7 @@ // load the offset into a register and do one sub/add unsigned Reg = 0; - if (isSub && !isEAXLiveIn(*MBB.getParent())) + if (isSub && !isLiveIn(*MBB.getParent(), X86::EAX)) Reg = (unsigned)(Is64Bit ? X86::RAX : X86::EAX); else Reg = findDeadCallerSavedReg(MBB, MBBI, TRI, Is64Bit); @@ -449,6 +450,10 @@ else Symbol = "_chkstk"; + // Save a pointer to the MI where we set AX. + MachineBasicBlock::iterator SetRAX = MBBI; + --SetRAX; + MachineInstrBuilder CI; // All current stack probes take AX and SP as input, clobber flags, and @@ -479,6 +484,204 @@ .addReg(X86::RSP) .addReg(X86::RAX); } + + // Apply the frame setup flag to all inserted instrs. + for (; SetRAX != MBBI; ++SetRAX) + SetRAX->setFlag(MachineInstr::FrameSetup); +} + +void X86FrameLowering::pushRegForStackProbeCall(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + DebugLoc DL, + bool &IsAlive, + unsigned RegType, + uint64_t &NumBytes) const { + IsAlive = isLiveIn(MF, RegType); + + if (!IsAlive) { + return; + } + + auto Reg = getX86SubSuperRegister(RegType, Is64Bit ? MVT::i64 : MVT::i32); + + // Save the register on the stack. + BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::PUSH64r : X86::PUSH32r)) + .addReg(Reg, RegState::Kill) + .setMIFlag(MachineInstr::FrameSetup); + + // Reuse the space from the spill as a stack allocation. + NumBytes -= SlotSize; +} + +void X86FrameLowering::popRegForStackProbeCall(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + DebugLoc DL, + bool &IsAlive, + unsigned RegType, + uint64_t &NumBytes) const { + if (!IsAlive) { + return; + } + + // Restore the register from the stack slot. + + auto Reg = getX86SubSuperRegister(RegType, Is64Bit ? MVT::i64 : MVT::i32); + + auto MIB = BuildMI(MF, DL, + TII.get(Is64Bit ? X86::MOV64rm : X86::MOV32rm), + Reg); + MachineInstr *MI = addRegOffset(MIB, StackPtr, false, NumBytes); + MI->setFlag(MachineInstr::FrameSetup); + MBB.insert(MBBI, MI); + + NumBytes += SlotSize; +} + +MachineInstr *X86FrameLowering::emitStackProbes(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + DebugLoc DL, + bool InProlog) const { + const X86Subtarget &STI = MF.getSubtarget(); + const TargetInstrInfo &TII = *STI.getInstrInfo(); + const BasicBlock *LLVM_BB = MBB.getBasicBlock(); + + // RAX contains the number of bytes of desired stack adjustment. + // The handling here assumes this value has already been updated so as to + // maintain stack alignment. + // + // We need to exit with RSP modified by this amount and execute suitable + // page touches to notify the OS that we're growing the stack responsibly. + // All stack probing must be done without modifying RSP. + + // Set up the new basic blocks + MachineBasicBlock *LoopMBB = MF.CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *ContinueMBB = MF.CreateMachineBasicBlock(LLVM_BB); + + MachineFunction::iterator MBBIter = MBB; + ++MBBIter; + + MF.insert(MBBIter, LoopMBB); + MF.insert(MBBIter, ContinueMBB); + + // Split MBB and move the tail portion down to ContinueMBB. + MachineBasicBlock::iterator BeforeMBBI = std::prev(MBBI); + ContinueMBB->splice(ContinueMBB->begin(), &MBB, MBBI, MBB.end()); + ContinueMBB->transferSuccessorsAndUpdatePHIs(&MBB); + + const int64_t GuardSize = 0x1000; + + auto RType = Is64Bit ? MVT::i64 : MVT::i32; + + auto InputReg = getX86SubSuperRegister(X86::RAX, RType); + + // Registers we need. For the normal case we use virtual + // registers. For the prolog expansion we use RAX, RCX and RDX. + MachineRegisterInfo &MRI = MF.getRegInfo(); + const TargetRegisterClass *RegClass = Is64Bit ? &X86::GR64RegClass : &X86::GR32RegClass; + unsigned + SizeReg = InProlog ? InputReg : + MRI.createVirtualRegister(RegClass), + StackReg = InProlog ? getX86SubSuperRegister(X86::RCX, RType) : + MRI.createVirtualRegister(RegClass), + CountReg = InProlog ? getX86SubSuperRegister(X86::RDX, RType) : + MRI.createVirtualRegister(RegClass), + PStackReg = InProlog ? getX86SubSuperRegister(X86::RCX, RType) : + MRI.createVirtualRegister(RegClass), + PCountReg = InProlog ? getX86SubSuperRegister(X86::RDX, RType) : + MRI.createVirtualRegister(RegClass), + LStackReg = InProlog ? getX86SubSuperRegister(X86::RCX, RType) : + MRI.createVirtualRegister(RegClass), + LCountReg = InProlog ? getX86SubSuperRegister(X86::RDX, RType) : + MRI.createVirtualRegister(RegClass); + + auto SPReg = getX86SubSuperRegister(X86::RSP, RType); + + if (InProlog) { + ContinueMBB->addLiveIn(InputReg); + LoopMBB->addLiveIn(InputReg); + LoopMBB->addLiveIn(LCountReg); + LoopMBB->addLiveIn(LStackReg); + + for (MachineBasicBlock::livein_iterator i = MBB.livein_begin(), + e = MBB.livein_end(); + i != e; i++) { + if (!LoopMBB->isLiveIn(*i)) + LoopMBB->addLiveIn(*i); + if (!ContinueMBB->isLiveIn(*i)) + ContinueMBB->addLiveIn(*i); + } + } else { + BuildMI(&MBB, DL, TII.get(X86::COPY), SizeReg).addReg(InputReg); + } + + BuildMI(&MBB, DL, TII.get(Is64Bit ? X86::MOV64rr : X86::MOV32rr), CountReg).addReg(SizeReg); + BuildMI(&MBB, DL, TII.get(Is64Bit ? X86::MOV64rr : X86::MOV32rr), StackReg).addReg(SPReg); + + if (!InProlog) { + BuildMI(LoopMBB, DL, TII.get(X86::PHI), PCountReg) + .addReg(CountReg) + .addMBB(&MBB) + .addReg(LCountReg) + .addMBB(LoopMBB); + BuildMI(LoopMBB, DL, TII.get(X86::PHI), PStackReg) + .addReg(StackReg) + .addMBB(&MBB) + .addReg(LStackReg) + .addMBB(LoopMBB); + } + + BuildMI(LoopMBB, DL, TII.get(Is64Bit ? X86::OR64mi8 : X86::OR32mi8)) + .addReg(PStackReg) + .addImm(1) + .addReg(0) + .addImm(0) + .addReg(0) + .addImm(0); + + BuildMI(LoopMBB, DL, TII.get(Is64Bit ? X86::SUB64ri32 : X86::SUB32ri), + LStackReg) + .addReg(PStackReg) + .addImm(GuardSize); + + BuildMI(LoopMBB, DL, TII.get(Is64Bit ? X86::SUB64ri32 : X86::SUB32ri), + LCountReg) + .addReg(PCountReg) + .addImm(GuardSize); + + BuildMI(LoopMBB, DL, TII.get(X86::JAE_1)).addMBB(LoopMBB); + + MachineBasicBlock::iterator ContinueMBBI = ContinueMBB->getFirstNonPHI(); + + // Now that the probing is done, add code to continueMBB to update + // the stack pointer for real. + BuildMI(*ContinueMBB, ContinueMBBI, DL, TII.get(Is64Bit ? X86::SUB64rr : X86::SUB32rr), SPReg) + .addReg(SPReg) + .addReg(SizeReg); + + // Add the control flow edges we need. + MBB.addSuccessor(LoopMBB); + LoopMBB->addSuccessor(ContinueMBB); + LoopMBB->addSuccessor(LoopMBB); + + // Mark all the instructions added to the prolog as frame setup. + if (InProlog) { + for (++BeforeMBBI; BeforeMBBI != MBB.end(); ++BeforeMBBI) { + BeforeMBBI->setFlag(MachineInstr::FrameSetup); + } + for (MachineBasicBlock::iterator MI = LoopMBB->begin(); + MI != LoopMBB->end(); ++MI) { + MI->setFlag(MachineInstr::FrameSetup); + } + for (MachineBasicBlock::iterator CMBBI = ContinueMBB->begin(); + CMBBI != ContinueMBBI; ++CMBBI) { + CMBBI->setFlag(MachineInstr::FrameSetup); + } + } + + return ContinueMBBI; } static unsigned calculateSetFPREG(uint64_t SPAdjust) { @@ -608,10 +811,11 @@ */ void X86FrameLowering::emitPrologue(MachineFunction &MF, - MachineBasicBlock &MBB) const { + MachineBasicBlock &InMBB) const { assert(&STI == &MF.getSubtarget() && "MF used frame lowering for wrong subtarget"); - MachineBasicBlock::iterator MBBI = MBB.begin(); + MachineBasicBlock *MBB = &InMBB; + MachineBasicBlock::iterator MBBI = MBB->begin(); MachineFrameInfo *MFI = MF.getFrameInfo(); const Function *Fn = MF.getFunction(); MachineModuleInfo &MMI = MF.getMMI(); @@ -641,7 +845,9 @@ X86FI->setCalleeSavedFrameSize( X86FI->getCalleeSavedFrameSize() - TailCallReturnAddrDelta); - bool UseStackProbe = (STI.isOSWindows() && !STI.isTargetMachO()); + bool UseRedZone = false; + bool WindowsStackProbe = STI.isOSWindows() && !STI.isTargetMachO(); + bool UseStackProbe = WindowsStackProbe || MF.shouldProbeStack(); // The default stack probe size is 4096 if the function has no stackprobesize // attribute. @@ -661,19 +867,26 @@ !MFI->hasVarSizedObjects() && // No dynamic alloca. !MFI->adjustsStack() && // No calls. !IsWin64CC && // Win64 has no Red Zone + + !(UseStackProbe && StackSize > 128) && // Only use the Red Zone if we can + // fit the whole stack in it + // and thus stack probes won't be + // needed + !usesTheStack(MF) && // Don't push and pop. !MF.shouldSplitStack()) { // Regular stack uint64_t MinSize = X86FI->getCalleeSavedFrameSize(); if (HasFP) MinSize += SlotSize; StackSize = std::max(MinSize, StackSize > 128 ? StackSize - 128 : 0); MFI->setStackSize(StackSize); + UseRedZone = true; } // Insert stack pointer adjustment for later moving of return addr. Only // applies to tail call optimized functions where the callee argument stack // size is bigger than the callers. if (TailCallReturnAddrDelta < 0) { - BuildStackAdjustment(MBB, MBBI, DL, TailCallReturnAddrDelta, + BuildStackAdjustment(*MBB, MBBI, DL, TailCallReturnAddrDelta, /*InEpilogue=*/false) .setMIFlag(MachineInstr::FrameSetup); } @@ -714,7 +927,7 @@ MFI->setOffsetAdjustment(-NumBytes); // Save EBP/RBP into the appropriate stack slot. - BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::PUSH64r : X86::PUSH32r)) + BuildMI(*MBB, MBBI, DL, TII.get(Is64Bit ? X86::PUSH64r : X86::PUSH32r)) .addReg(MachineFramePtr, RegState::Kill) .setMIFlag(MachineInstr::FrameSetup); @@ -722,24 +935,24 @@ // Mark the place where EBP/RBP was saved. // Define the current CFA rule to use the provided offset. assert(StackSize); - BuildCFI(MBB, MBBI, DL, + BuildCFI(*MBB, MBBI, DL, MCCFIInstruction::createDefCfaOffset(nullptr, 2 * stackGrowth)); // Change the rule for the FramePtr to be an "offset" rule. unsigned DwarfFramePtr = TRI->getDwarfRegNum(MachineFramePtr, true); - BuildCFI(MBB, MBBI, DL, MCCFIInstruction::createOffset( + BuildCFI(*MBB, MBBI, DL, MCCFIInstruction::createOffset( nullptr, DwarfFramePtr, 2 * stackGrowth)); } if (NeedsWinCFI) { - BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_PushReg)) + BuildMI(*MBB, MBBI, DL, TII.get(X86::SEH_PushReg)) .addImm(FramePtr) .setMIFlag(MachineInstr::FrameSetup); } if (!IsWin64Prologue) { // Update EBP with the new base value. - BuildMI(MBB, MBBI, DL, + BuildMI(*MBB, MBBI, DL, TII.get(Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr), FramePtr) .addReg(StackPtr) @@ -750,7 +963,7 @@ // Mark effective beginning of when frame pointer becomes valid. // Define the current CFA to use the EBP/RBP register. unsigned DwarfFramePtr = TRI->getDwarfRegNum(MachineFramePtr, true); - BuildCFI(MBB, MBBI, DL, + BuildCFI(*MBB, MBBI, DL, MCCFIInstruction::createDefCfaRegister(nullptr, DwarfFramePtr)); } @@ -765,7 +978,7 @@ bool PushedRegs = false; int StackOffset = 2 * stackGrowth; - while (MBBI != MBB.end() && + while (MBBI != MBB->end() && MBBI->getFlag(MachineInstr::FrameSetup) && (MBBI->getOpcode() == X86::PUSH32r || MBBI->getOpcode() == X86::PUSH64r)) { @@ -777,13 +990,13 @@ // Mark callee-saved push instruction. // Define the current CFA rule to use the provided offset. assert(StackSize); - BuildCFI(MBB, MBBI, DL, + BuildCFI(*MBB, MBBI, DL, MCCFIInstruction::createDefCfaOffset(nullptr, StackOffset)); StackOffset += stackGrowth; } if (NeedsWinCFI) { - BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_PushReg)).addImm(Reg).setMIFlag( + BuildMI(*MBB, MBBI, DL, TII.get(X86::SEH_PushReg)).addImm(Reg).setMIFlag( MachineInstr::FrameSetup); } } @@ -793,13 +1006,13 @@ // Don't do this for Win64, it needs to realign the stack after the prologue. if (!IsWin64Prologue && TRI->needsStackRealignment(MF)) { assert(HasFP && "There should be a frame pointer if stack is realigned."); - BuildStackAlignAND(MBB, MBBI, DL, MaxAlign); + BuildStackAlignAND(*MBB, MBBI, DL, MaxAlign); } // If there is an SUB32ri of ESP immediately before this instruction, merge // the two. This can be the case when tail call elimination is enabled and // the callee has more arguments then the caller. - NumBytes -= mergeSPUpdates(MBB, MBBI, true); + NumBytes -= mergeSPUpdates(*MBB, MBBI, true); // Adjust stack pointer: ESP -= numbytes. @@ -815,69 +1028,98 @@ if (IsWin64Prologue && TRI->needsStackRealignment(MF)) AlignedNumBytes = RoundUpToAlignment(AlignedNumBytes, MaxAlign); if (AlignedNumBytes >= StackProbeSize && UseStackProbe) { - // Check whether EAX is livein for this function. - bool isEAXAlive = isEAXLiveIn(MF); + assert(!UseRedZone && "The Red Zone is not accounted for in stack probes"); - if (isEAXAlive) { - // Sanity check that EAX is not livein for this function. - // It should not be, so throw an assert. - assert(!Is64Bit && "EAX is livein in x64 case!"); + uint64_t PageSize = 0x1000; - // Save EAX - BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH32r)) - .addReg(X86::EAX, RegState::Kill) - .setMIFlag(MachineInstr::FrameSetup); - } + // If we only need to probe 5 pages or below, we just emit instructions to + // do that instead of calling the function. This is just what the loop in + // the called function would do. 5 probes is what GCC will do before using + // a loop. + if (!WindowsStackProbe && NumBytes <= 5 * PageSize) { + for (uint64_t i = 0; i < NumBytes / PageSize; ++i) { + BuildMI(*MBB, MBBI, DL, TII.get(Is64Bit ? X86::OR64mi8 : X86::OR32mi8)) + .addReg(StackPtr) + .addImm(1) + .addReg(0) + .addImm(- (i + 1) * PageSize) + .addReg(0) + .addImm(0) + .setMIFlag(MachineInstr::FrameSetup); + } - if (Is64Bit) { - // Handle the 64-bit Windows ABI case where we need to call __chkstk. - // Function prologue is responsible for adjusting the stack pointer. - if (isUInt<32>(NumBytes)) { - BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX) - .addImm(NumBytes) - .setMIFlag(MachineInstr::FrameSetup); - } else if (isInt<32>(NumBytes)) { - BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64ri32), X86::RAX) - .addImm(NumBytes) - .setMIFlag(MachineInstr::FrameSetup); + BuildMI(*MBB, MBBI, DL, TII.get(Is64Bit ? X86::SUB64ri32 : X86::SUB32ri), + StackPtr) + .addReg(StackPtr) + .addImm(NumBytes) + .setMIFlag(MachineInstr::FrameSetup); + + } else { + // In the large code model, we have to load the stack probe function into a scratch + // register to call it. R11 is used for that. + bool SpillR11 = WindowsStackProbe && + Is64Bit && + MF.getTarget().getCodeModel() == CodeModel::Large; + + // We spill the registers we need for the stack probes. + + bool RAXAlive, RCXAlive, RDXAlive, R11Alive; + + pushRegForStackProbeCall(MF, *MBB, MBBI, DL, RAXAlive, X86::RAX, NumBytes); + if (!WindowsStackProbe) { + pushRegForStackProbeCall(MF, *MBB, MBBI, DL, RCXAlive, X86::RCX, NumBytes); + pushRegForStackProbeCall(MF, *MBB, MBBI, DL, RDXAlive, X86::RDX, NumBytes); + } + if (SpillR11) { + pushRegForStackProbeCall(MF, *MBB, MBBI, DL, R11Alive, X86::R11, NumBytes); + } + + if (Is64Bit) { + if (isUInt<32>(NumBytes)) { + BuildMI(*MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX) + .addImm(NumBytes) + .setMIFlag(MachineInstr::FrameSetup); + } else if (isInt<32>(NumBytes)) { + BuildMI(*MBB, MBBI, DL, TII.get(X86::MOV64ri32), X86::RAX) + .addImm(NumBytes) + .setMIFlag(MachineInstr::FrameSetup); + } else { + BuildMI(*MBB, MBBI, DL, TII.get(X86::MOV64ri), X86::RAX) + .addImm(NumBytes) + .setMIFlag(MachineInstr::FrameSetup); + } } else { - BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64ri), X86::RAX) + BuildMI(*MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX) .addImm(NumBytes) .setMIFlag(MachineInstr::FrameSetup); } - } else { - // Allocate NumBytes-4 bytes on stack in case of isEAXAlive. - // We'll also use 4 already allocated bytes for EAX. - BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX) - .addImm(isEAXAlive ? NumBytes - 4 : NumBytes) - .setMIFlag(MachineInstr::FrameSetup); - } - - // Save a pointer to the MI where we set AX. - MachineBasicBlock::iterator SetRAX = MBBI; - --SetRAX; - // Call __chkstk, __chkstk_ms, or __alloca. - emitStackProbeCall(MF, MBB, MBBI, DL); - - // Apply the frame setup flag to all inserted instrs. - for (; SetRAX != MBBI; ++SetRAX) - SetRAX->setFlag(MachineInstr::FrameSetup); + if (WindowsStackProbe) { + // Call __chkstk, __chkstk_ms, or __alloca. + emitStackProbeCall(MF, *MBB, MBBI, DL); + } else { + // Emit the stack probe loop. + MachineInstr *NextInstr = emitStackProbes(MF, *MBB, MBBI, DL, true); + MBBI = NextInstr; + MBB = NextInstr->getParent(); + } - if (isEAXAlive) { - // Restore EAX - MachineInstr *MI = addRegOffset(BuildMI(MF, DL, TII.get(X86::MOV32rm), - X86::EAX), - StackPtr, false, NumBytes - 4); - MI->setFlag(MachineInstr::FrameSetup); - MBB.insert(MBBI, MI); + // Now we restore the spilled registers from the stack + if (SpillR11) { + popRegForStackProbeCall(MF, *MBB, MBBI, DL, R11Alive, X86::R11, NumBytes); + } + if (!WindowsStackProbe) { + popRegForStackProbeCall(MF, *MBB, MBBI, DL, RDXAlive, X86::RDX, NumBytes); + popRegForStackProbeCall(MF, *MBB, MBBI, DL, RCXAlive, X86::RCX, NumBytes); + } + popRegForStackProbeCall(MF, *MBB, MBBI, DL, RAXAlive, X86::RAX, NumBytes); } } else if (NumBytes) { - emitSPUpdate(MBB, MBBI, -(int64_t)NumBytes, /*InEpilogue=*/false); + emitSPUpdate(*MBB, MBBI, -(int64_t)NumBytes, /*InEpilogue=*/false); } if (NeedsWinCFI && NumBytes) - BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_StackAlloc)) + BuildMI(*MBB, MBBI, DL, TII.get(X86::SEH_StackAlloc)) .addImm(NumBytes) .setMIFlag(MachineInstr::FrameSetup); @@ -885,19 +1127,19 @@ if (IsWin64Prologue && HasFP) { SEHFrameOffset = calculateSetFPREG(NumBytes); if (SEHFrameOffset) - addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::LEA64r), FramePtr), + addRegOffset(BuildMI(*MBB, MBBI, DL, TII.get(X86::LEA64r), FramePtr), StackPtr, false, SEHFrameOffset); else - BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64rr), FramePtr).addReg(StackPtr); + BuildMI(*MBB, MBBI, DL, TII.get(X86::MOV64rr), FramePtr).addReg(StackPtr); if (NeedsWinCFI) - BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SetFrame)) + BuildMI(*MBB, MBBI, DL, TII.get(X86::SEH_SetFrame)) .addImm(FramePtr) .addImm(SEHFrameOffset) .setMIFlag(MachineInstr::FrameSetup); } - while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup)) { + while (MBBI != MBB->end() && MBBI->getFlag(MachineInstr::FrameSetup)) { const MachineInstr *FrameInstr = &*MBBI; ++MBBI; @@ -909,7 +1151,7 @@ int Offset = getFrameIndexReference(MF, FI, IgnoredFrameReg); Offset += SEHFrameOffset; - BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SaveXMM)) + BuildMI(*MBB, MBBI, DL, TII.get(X86::SEH_SaveXMM)) .addImm(Reg) .addImm(Offset) .setMIFlag(MachineInstr::FrameSetup); @@ -919,7 +1161,7 @@ } if (NeedsWinCFI) - BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_EndPrologue)) + BuildMI(*MBB, MBBI, DL, TII.get(X86::SEH_EndPrologue)) .setMIFlag(MachineInstr::FrameSetup); // Realign stack after we spilled callee-saved registers (so that we'll be @@ -927,7 +1169,7 @@ // Win64 requires aligning the stack after the prologue. if (IsWin64Prologue && TRI->needsStackRealignment(MF)) { assert(HasFP && "There should be a frame pointer if stack is realigned."); - BuildStackAlignAND(MBB, MBBI, DL, MaxAlign); + BuildStackAlignAND(*MBB, MBBI, DL, MaxAlign); } // If we need a base pointer, set it up here. It's whatever the value @@ -937,14 +1179,14 @@ if (TRI->hasBasePointer(MF)) { // Update the base pointer with the current stack pointer. unsigned Opc = Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr; - BuildMI(MBB, MBBI, DL, TII.get(Opc), BasePtr) + BuildMI(*MBB, MBBI, DL, TII.get(Opc), BasePtr) .addReg(StackPtr) .setMIFlag(MachineInstr::FrameSetup); if (X86FI->getRestoreBasePointer()) { // Stash value of base pointer. Saving RSP instead of EBP shortens // dependence chain. Used by SjLj EH. unsigned Opm = Uses64BitFramePtr ? X86::MOV64mr : X86::MOV32mr; - addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opm)), + addRegOffset(BuildMI(*MBB, MBBI, DL, TII.get(Opm)), FramePtr, true, X86FI->getRestoreBasePointerOffset()) .addReg(StackPtr) .setMIFlag(MachineInstr::FrameSetup); @@ -957,7 +1199,7 @@ // other way around. unsigned Opm = Uses64BitFramePtr ? X86::MOV64mr : X86::MOV32mr; unsigned IgnoredFrameReg; - addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opm)), BasePtr, true, + addRegOffset(BuildMI(*MBB, MBBI, DL, TII.get(Opm)), BasePtr, true, getFrameIndexReference(MF, X86FI->getSEHFramePtrSaveIndex(), IgnoredFrameReg)) .addReg(FramePtr) @@ -970,13 +1212,13 @@ if (!HasFP && NumBytes) { // Define the current CFA rule to use the provided offset. assert(StackSize); - BuildCFI(MBB, MBBI, DL, MCCFIInstruction::createDefCfaOffset( + BuildCFI(*MBB, MBBI, DL, MCCFIInstruction::createDefCfaOffset( nullptr, -StackSize + stackGrowth)); } // Emit DWARF info specifying the offsets of the callee-saved registers. if (PushedRegs) - emitCalleeSavedFrameMoves(MBB, MBBI, DL); + emitCalleeSavedFrameMoves(*MBB, MBBI, DL); } } Index: lib/Target/X86/X86ISelLowering.cpp =================================================================== --- lib/Target/X86/X86ISelLowering.cpp +++ lib/Target/X86/X86ISelLowering.cpp @@ -15047,7 +15047,7 @@ MachineFunction &MF = DAG.getMachineFunction(); bool SplitStack = MF.shouldSplitStack(); bool Lower = (Subtarget->isOSWindows() && !Subtarget->isTargetMachO()) || - SplitStack; + SplitStack || MF.shouldProbeStack(); SDLoc dl(Op); if (!Lower) { @@ -15125,6 +15125,7 @@ Chain = DAG.getCopyToReg(Chain, dl, Reg, Size, Flag); Flag = Chain.getValue(1); + SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag); @@ -20629,13 +20630,21 @@ MachineBasicBlock *BB) const { DebugLoc DL = MI->getDebugLoc(); - assert(!Subtarget->isTargetMachO()); + MachineBasicBlock *ResumeBB; - Subtarget->getFrameLowering()->emitStackProbeCall(*BB->getParent(), *BB, MI, - DL); + if (Subtarget->isOSWindows() && !Subtarget->isTargetMachO()) { + Subtarget->getFrameLowering()->emitStackProbeCall(*BB->getParent(), *BB, MI, + DL); + ResumeBB = BB; + } else { + MachineInstr *ResumeMI = + Subtarget->getFrameLowering()->emitStackProbes(*BB->getParent(), *BB, MI, + DL, false); + ResumeBB = ResumeMI->getParent(); + } MI->eraseFromParent(); // The pseudo instruction is gone now. - return BB; + return ResumeBB; } MachineBasicBlock * Index: test/CodeGen/X86/movtopush.ll =================================================================== --- test/CodeGen/X86/movtopush.ll +++ test/CodeGen/X86/movtopush.ll @@ -67,7 +67,6 @@ ; If we have a reserved frame, we should have pushes ; NORMAL-LABEL: test2: -; NORMAL-NOT: subl {{.*}} %esp ; NORMAL: pushl $4 ; NORMAL-NEXT: pushl $3 ; NORMAL-NEXT: pushl $2 Index: test/CodeGen/X86/stack-probes.ll =================================================================== --- /dev/null +++ test/CodeGen/X86/stack-probes.ll @@ -0,0 +1,77 @@ +; RUN: llc -mtriple=i386-pc-linux-gnu < %s -o - | FileCheck --check-prefix=X86-LINUX %s +; RUN: llc -mtriple=x86_64-pc-linux-gnu < %s -o - | FileCheck --check-prefix=X64-LINUX %s + +; Ensure calls to __probestack occur for large stack frames +define void @test() "probe-stack" { + %array = alloca [40096 x i8] + ret void + +; X86-LINUX-LABEL: test: +; X86-LINUX: movl $40096, %eax # imm = 0x9CA0 +; X86-LINUX-NEXT: movl %eax, %edx +; X86-LINUX-NEXT: movl %esp, %ecx +; X86-LINUX-LABEL: .LBB0_1 +; X86-LINUX-NEXT: orl $0, (%ecx) +; X86-LINUX-NEXT: subl $4096, %ecx +; X86-LINUX-NEXT: subl $4096, %edx +; X86-LINUX-NEXT: jae .LBB0_1 +; X86-LINUX: subl %eax, %esp +; X86-LINUX: addl $40096, %esp # imm = 0x9CA0 + +; X64-LINUX-LABEL: test: +; X64-LINUX: movl $40096, %eax # imm = 0x9CA0 +; X64-LINUX-NEXT: movq %rax, %rdx +; X64-LINUX-NEXT: movq %rsp, %rcx +; X64-LINUX-LABEL: .LBB0_1 +; X64-LINUX-NEXT: orq $0, (%rcx) +; X64-LINUX-NEXT: subq $4096, %rcx +; X64-LINUX-NEXT: subq $4096, %rdx +; X64-LINUX-NEXT: jae .LBB0_1 +; X64-LINUX: subq %rax, %rsp +; X64-LINUX: addq $40096, %rsp # imm = 0x9CA0 + +} + +; Ensure the stack is probed for medium stack frames +define void @testFast() "probe-stack" { + %array = alloca [4096 x i8] + ret void + +; X86-LINUX-LABEL: testFast: +; X86-LINUX: orl $0, -4096(%esp) +; X86-LINUX-NEXT: subl $4096, %esp # imm = 0x1000 + +; X64-LINUX-LABEL: testFast: +; X64-LINUX: orq $0, -4096(%rsp) +; X64-LINUX-NEXT: subq $4096, %rsp # imm = 0x1000 + +} + +; Ensure the stack is probed for dynamic allocations +define void @testDynamic(i64 %n) "probe-stack" { + %buf = alloca i8, i64 %n + ret void + +; X86-LINUX-LABEL: testDynamic: +; X86-LINUX: movl %eax, %ecx +; X86-LINUX-NEXT: movl %esp, %edx +; X86-LINUX-LABEL: .LBB2_1 +; X86-LINUX-NEXT: orl $0, (%edx) +; X86-LINUX-NEXT: subl $4096, %edx +; X86-LINUX-NEXT: subl $4096, %ecx +; X86-LINUX-NEXT: jae .LBB2_1 +; X86-LINUX: subl %eax, %esp + +; X64-LINUX-LABEL: testDynamic: +; X64-LINUX: leaq 15(%rdi), %rax +; X64-LINUX-NEXT: andq $-16, %rax +; X64-LINUX-NEXT: movq %rax, %rcx +; X64-LINUX-NEXT: movq %rsp, %rdx +; X64-LINUX-LABEL: .LBB2_1 +; X64-LINUX-NEXT: orq $0, (%rdx) +; X64-LINUX-NEXT: subq $4096, %rdx +; X64-LINUX-NEXT: subq $4096, %rcx +; X64-LINUX-NEXT: jae .LBB2_1 +; X64-LINUX: subq %rax, %rsp + +}