Index: include/llvm/CodeGen/MachineFunction.h
===================================================================
--- include/llvm/CodeGen/MachineFunction.h
+++ include/llvm/CodeGen/MachineFunction.h
@@ -287,6 +287,14 @@
   /// Should we be emitting segmented stack stuff for the function
   bool shouldSplitStack();
 
+  /// \brief Should we be probing the stack for the function.
+  ///
+  /// Probing the stack means that we must read or write to the stack on every
+  /// page. This is to ensure that a guard page will be hit and stack overflow
+  /// can be detected. We insert instructions to do this when allocating from
+  /// the stack.
+  bool shouldProbeStack() const;
+
   /// getNumBlockIDs - Return the number of MBB ID's allocated.
   ///
   unsigned getNumBlockIDs() const { return (unsigned)MBBNumbering.size(); }
Index: lib/CodeGen/MachineFunction.cpp
===================================================================
--- lib/CodeGen/MachineFunction.cpp
+++ lib/CodeGen/MachineFunction.cpp
@@ -147,6 +147,10 @@
   return getFunction()->hasFnAttribute("split-stack");
 }
 
+bool MachineFunction::shouldProbeStack() const {
+  return getFunction()->hasFnAttribute("probe-stack");
+}
+
 /// This discards all of the MachineBasicBlock numbers and recomputes them.
 /// This guarantees that the MBB numbers are sequential, dense, and match the
 /// ordering of the blocks within the function. If a specific MachineBasicBlock
Index: lib/CodeGen/PrologEpilogInserter.cpp
===================================================================
--- lib/CodeGen/PrologEpilogInserter.cpp
+++ lib/CodeGen/PrologEpilogInserter.cpp
@@ -773,6 +773,10 @@
   // Add prologue to the function...
   TFI.emitPrologue(Fn, *SaveBlock);
 
+  // RestoreBlocks can be clobbered by emitPrologue. Recalculate it.
+  RestoreBlocks.clear();
+  calculateSets(Fn);
+
   // Add epilogue to restore the callee-save registers in each exiting block.
   for (MachineBasicBlock *RestoreBlock : RestoreBlocks)
     TFI.emitEpilogue(Fn, *RestoreBlock);
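Note: the new hook keys off a plain string attribute, so any frontend can opt
in per function; nothing above is Windows-specific. A minimal, illustrative IR
sketch (the function is hypothetical, but the "probe-stack" attribute string is
exactly what shouldProbeStack() tests for; the new stack-probes.ll test below
uses the same form):

    ; Hypothetical function that opts in to stack probing.
    define void @opt_in() "probe-stack" {
      %buf = alloca [8192 x i8]   ; two pages, large enough to require probes
      ret void
    }
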
Index: lib/Target/X86/X86FrameLowering.h
===================================================================
--- lib/Target/X86/X86FrameLowering.h
+++ lib/Target/X86/X86FrameLowering.h
@@ -47,11 +47,26 @@
 
   unsigned StackPtr;
 
+  void pushRegForStackProbeCall(MachineFunction &MF,
+                                MachineBasicBlock &MBB,
+                                MachineBasicBlock::iterator MBBI,
+                                DebugLoc DL,
+                                bool &IsAlive,
+                                unsigned RegType,
+                                uint64_t &NumBytes) const;
+  void popRegForStackProbeCall(MachineFunction &MF,
+                               MachineBasicBlock &MBB,
+                               MachineBasicBlock::iterator MBBI,
+                               DebugLoc DL,
+                               bool &IsAlive,
+                               unsigned RegType,
+                               uint64_t &NumBytes) const;
-  /// Emit a call to the target's stack probe function. This is required for all
-  /// large stack allocations on Windows. The caller is required to materialize
-  /// the number of bytes to probe in RAX/EAX.
-  void emitStackProbeCall(MachineFunction &MF, MachineBasicBlock &MBB,
-                          MachineBasicBlock::iterator MBBI, DebugLoc DL) const;
+  /// Emit inline stack probes for a large stack allocation. The caller is
+  /// required to materialize the number of bytes to probe in RAX/EAX.
+  /// Returns the instruction at which execution resumes after the probes.
+  MachineInstr *emitStackProbes(MachineFunction &MF, MachineBasicBlock &MBB,
+                                MachineBasicBlock::iterator MBBI, DebugLoc DL,
+                                bool InProlog) const;
 
   void emitCalleeSavedFrameMoves(MachineBasicBlock &MBB,
                                  MachineBasicBlock::iterator MBBI,
@@ -59,7 +74,7 @@
 
   /// emitProlog/emitEpilog - These methods insert prolog and epilog code into
   /// the function.
-  void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+  void emitPrologue(MachineFunction &MF, MachineBasicBlock &InMBB) const override;
   void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
 
   void adjustForSegmentedStacks(MachineFunction &MF,
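Note: when RAX, RCX or RDX are live into the function, the prologue spills them
around the probe loop with pushRegForStackProbeCall and restores them with
popRegForStackProbeCall. The restore is a load from the slot rather than a pop,
because by that point RSP has already been moved below the slots. An
illustrative shape of such a prologue, assuming all three registers are live
and writing NNN for the allocation size after the three slots are deducted
(hypothetical offsets, not output of this patch's tests):

    ;   pushq %rax               ; spill; each slot also counts as allocation
    ;   pushq %rcx
    ;   pushq %rdx
    ;   movl  $NNN, %eax         ; probe size, reduced by the three slots
    ;   ...                      ; probe loop emitted by emitStackProbes
    ;   subq  %rax, %rsp
    ;   movq  NNN(%rsp), %rdx    ; reload from the slots by offset
    ;   movq  NNN+8(%rsp), %rcx
    ;   movq  NNN+16(%rsp), %rax
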
Index: lib/Target/X86/X86FrameLowering.cpp
===================================================================
--- lib/Target/X86/X86FrameLowering.cpp
+++ lib/Target/X86/X86FrameLowering.cpp
@@ -197,13 +197,14 @@
   return 0;
 }
 
-static bool isEAXLiveIn(MachineFunction &MF) {
+static bool isLiveIn(MachineFunction &MF, unsigned CheckReg) {
+  CheckReg = getX86SubSuperRegister(CheckReg, MVT::i32);
+
   for (MachineRegisterInfo::livein_iterator II = MF.getRegInfo().livein_begin(),
        EE = MF.getRegInfo().livein_end(); II != EE; ++II) {
     unsigned Reg = II->first;
 
-    if (Reg == X86::RAX || Reg == X86::EAX || Reg == X86::AX ||
-        Reg == X86::AH || Reg == X86::AL)
+    if (getX86SubSuperRegisterOrZero(Reg, MVT::i32) == CheckReg)
       return true;
   }
 
@@ -250,7 +251,7 @@
     // load the offset into a register and do one sub/add
     unsigned Reg = 0;
 
-    if (isSub && !isEAXLiveIn(*MBB.getParent()))
+    if (isSub && !isLiveIn(*MBB.getParent(), X86::EAX))
       Reg = (unsigned)(Is64Bit ? X86::RAX : X86::EAX);
     else
       Reg = findDeadCallerSavedReg(MBB, MBBI, TRI, Is64Bit);
@@ -425,60 +426,198 @@
   return false;
 }
 
-void X86FrameLowering::emitStackProbeCall(MachineFunction &MF,
-                                          MachineBasicBlock &MBB,
-                                          MachineBasicBlock::iterator MBBI,
-                                          DebugLoc DL) const {
-  bool IsLargeCodeModel = MF.getTarget().getCodeModel() == CodeModel::Large;
+void X86FrameLowering::pushRegForStackProbeCall(MachineFunction &MF,
+                                                MachineBasicBlock &MBB,
+                                                MachineBasicBlock::iterator MBBI,
+                                                DebugLoc DL,
+                                                bool &IsAlive,
+                                                unsigned RegType,
+                                                uint64_t &NumBytes) const {
+  IsAlive = isLiveIn(MF, RegType);
 
-  unsigned CallOp;
-  if (Is64Bit)
-    CallOp = IsLargeCodeModel ? X86::CALL64r : X86::CALL64pcrel32;
-  else
-    CallOp = X86::CALLpcrel32;
+  if (!IsAlive) {
+    return;
+  }
 
-  const char *Symbol;
-  if (Is64Bit) {
-    if (STI.isTargetCygMing()) {
-      Symbol = "___chkstk_ms";
-    } else {
-      Symbol = "__chkstk";
-    }
-  } else if (STI.isTargetCygMing())
-    Symbol = "_alloca";
-  else
-    Symbol = "_chkstk";
+  auto Reg = getX86SubSuperRegister(RegType, Is64Bit ? MVT::i64 : MVT::i32);
 
-  MachineInstrBuilder CI;
+  // Save the register on the stack.
+  BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::PUSH64r : X86::PUSH32r))
+      .addReg(Reg, RegState::Kill)
+      .setMIFlag(MachineInstr::FrameSetup);
 
-  // All current stack probes take AX and SP as input, clobber flags, and
-  // preserve all registers. x86_64 probes leave RSP unmodified.
-  if (Is64Bit && MF.getTarget().getCodeModel() == CodeModel::Large) {
-    // For the large code model, we have to call through a register. Use R11,
-    // as it is scratch in all supported calling conventions.
-    BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64ri), X86::R11)
-        .addExternalSymbol(Symbol);
-    CI = BuildMI(MBB, MBBI, DL, TII.get(CallOp)).addReg(X86::R11);
+  // Reuse the space from the spill as a stack allocation.
+  NumBytes -= SlotSize;
+}
+
+void X86FrameLowering::popRegForStackProbeCall(MachineFunction &MF,
+                                               MachineBasicBlock &MBB,
+                                               MachineBasicBlock::iterator MBBI,
+                                               DebugLoc DL,
+                                               bool &IsAlive,
+                                               unsigned RegType,
+                                               uint64_t &NumBytes) const {
+  if (!IsAlive) {
+    return;
+  }
+
+  // Restore the register from the stack slot.
+
+  auto Reg = getX86SubSuperRegister(RegType, Is64Bit ? MVT::i64 : MVT::i32);
+
+  auto MIB = BuildMI(MF, DL,
+                     TII.get(Is64Bit ? X86::MOV64rm : X86::MOV32rm),
+                     Reg);
+  MachineInstr *MI = addRegOffset(MIB, StackPtr, false, NumBytes);
+  MI->setFlag(MachineInstr::FrameSetup);
+  MBB.insert(MBBI, MI);
+
+  NumBytes += SlotSize;
+}
+
+MachineInstr *X86FrameLowering::emitStackProbes(MachineFunction &MF,
+                                                MachineBasicBlock &MBB,
+                                                MachineBasicBlock::iterator MBBI,
+                                                DebugLoc DL,
+                                                bool InProlog) const {
+  const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
+  const TargetInstrInfo &TII = *STI.getInstrInfo();
+  const BasicBlock *LLVM_BB = MBB.getBasicBlock();
+
+  // RAX contains the number of bytes of desired stack adjustment.
+  // The handling here assumes this value has already been updated so as to
+  // maintain stack alignment.
+  //
+  // We need to exit with RSP modified by this amount and execute suitable
+  // page touches to notify the OS that we're growing the stack responsibly.
+  // All stack probing must be done without modifying RSP.
+
+  // Set up the new basic blocks.
+  MachineBasicBlock *LoopMBB = MF.CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *ContinueMBB = MF.CreateMachineBasicBlock(LLVM_BB);
+
+  MachineFunction::iterator MBBIter = MBB;
+  ++MBBIter;
+
+  MF.insert(MBBIter, LoopMBB);
+  MF.insert(MBBIter, ContinueMBB);
+
+  // Split MBB and move the tail portion down to ContinueMBB.
+  MachineBasicBlock::iterator BeforeMBBI = std::prev(MBBI);
+  ContinueMBB->splice(ContinueMBB->begin(), &MBB, MBBI, MBB.end());
+  ContinueMBB->transferSuccessorsAndUpdatePHIs(&MBB);
+
+  // Probe one page (0x1000 bytes) at a time.
+  const int64_t GuardSize = 0x1000;
+
+  auto RType = Is64Bit ? MVT::i64 : MVT::i32;
+  auto InputReg = getX86SubSuperRegister(X86::RAX, RType);
+
+  // Registers we need. For the normal case we use virtual
+  // registers. For the prolog expansion we use RAX, RCX and RDX.
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  const TargetRegisterClass *RegClass =
+      Is64Bit ? &X86::GR64RegClass : &X86::GR32RegClass;
+  unsigned SizeReg = InProlog ? InputReg
+                              : MRI.createVirtualRegister(RegClass),
+           StackReg = InProlog ? getX86SubSuperRegister(X86::RCX, RType)
+                               : MRI.createVirtualRegister(RegClass),
+           CountReg = InProlog ? getX86SubSuperRegister(X86::RDX, RType)
+                               : MRI.createVirtualRegister(RegClass),
+           PStackReg = InProlog ? getX86SubSuperRegister(X86::RCX, RType)
+                                : MRI.createVirtualRegister(RegClass),
+           PCountReg = InProlog ? getX86SubSuperRegister(X86::RDX, RType)
+                                : MRI.createVirtualRegister(RegClass),
+           LStackReg = InProlog ? getX86SubSuperRegister(X86::RCX, RType)
+                                : MRI.createVirtualRegister(RegClass),
+           LCountReg = InProlog ? getX86SubSuperRegister(X86::RDX, RType)
+                                : MRI.createVirtualRegister(RegClass);
+
+  auto SPReg = getX86SubSuperRegister(X86::RSP, RType);
+
+  if (InProlog) {
+    ContinueMBB->addLiveIn(InputReg);
+    LoopMBB->addLiveIn(InputReg);
+    LoopMBB->addLiveIn(LCountReg);
+    LoopMBB->addLiveIn(LStackReg);
+
+    for (MachineBasicBlock::livein_iterator i = MBB.livein_begin(),
+                                            e = MBB.livein_end();
+         i != e; ++i) {
+      if (!LoopMBB->isLiveIn(*i))
+        LoopMBB->addLiveIn(*i);
+      if (!ContinueMBB->isLiveIn(*i))
+        ContinueMBB->addLiveIn(*i);
+    }
   } else {
-    CI = BuildMI(MBB, MBBI, DL, TII.get(CallOp)).addExternalSymbol(Symbol);
+    BuildMI(&MBB, DL, TII.get(X86::COPY), SizeReg).addReg(InputReg);
   }
 
-  unsigned AX = Is64Bit ? X86::RAX : X86::EAX;
-  unsigned SP = Is64Bit ? X86::RSP : X86::ESP;
-  CI.addReg(AX, RegState::Implicit)
-      .addReg(SP, RegState::Implicit)
-      .addReg(AX, RegState::Define | RegState::Implicit)
-      .addReg(SP, RegState::Define | RegState::Implicit)
-      .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
+  BuildMI(&MBB, DL, TII.get(Is64Bit ? X86::MOV64rr : X86::MOV32rr), CountReg)
+      .addReg(SizeReg);
+  BuildMI(&MBB, DL, TII.get(Is64Bit ? X86::MOV64rr : X86::MOV32rr), StackReg)
+      .addReg(SPReg);
+
+  if (!InProlog) {
+    BuildMI(LoopMBB, DL, TII.get(X86::PHI), PCountReg)
+        .addReg(CountReg)
+        .addMBB(&MBB)
+        .addReg(LCountReg)
+        .addMBB(LoopMBB);
+    BuildMI(LoopMBB, DL, TII.get(X86::PHI), PStackReg)
+        .addReg(StackReg)
+        .addMBB(&MBB)
+        .addReg(LStackReg)
+        .addMBB(LoopMBB);
+  }
 
-  if (Is64Bit) {
-    // MSVC x64's __chkstk and cygwin/mingw's ___chkstk_ms do not adjust %rsp
-    // themselves. It also does not clobber %rax so we can reuse it when
-    // adjusting %rsp.
-    BuildMI(MBB, MBBI, DL, TII.get(X86::SUB64rr), X86::RSP)
-        .addReg(X86::RSP)
-        .addReg(X86::RAX);
+  // Touch the page at the current probe cursor.
+  BuildMI(LoopMBB, DL, TII.get(Is64Bit ? X86::OR64mi8 : X86::OR32mi8))
+      .addReg(PStackReg)
+      .addImm(1)
+      .addReg(0)
+      .addImm(0)
+      .addReg(0)
+      .addImm(0);
+
+  // Step the cursor and the remaining count down one guard page.
+  BuildMI(LoopMBB, DL, TII.get(Is64Bit ? X86::SUB64ri32 : X86::SUB32ri),
+          LStackReg)
+      .addReg(PStackReg)
+      .addImm(GuardSize);
+
+  BuildMI(LoopMBB, DL, TII.get(Is64Bit ? X86::SUB64ri32 : X86::SUB32ri),
+          LCountReg)
+      .addReg(PCountReg)
+      .addImm(GuardSize);
+
+  BuildMI(LoopMBB, DL, TII.get(X86::JAE_1)).addMBB(LoopMBB);
+
+  MachineBasicBlock::iterator ContinueMBBI = ContinueMBB->getFirstNonPHI();
+
+  // Now that the probing is done, add code to ContinueMBB to update
+  // the stack pointer for real.
+  BuildMI(*ContinueMBB, ContinueMBBI, DL,
+          TII.get(Is64Bit ? X86::SUB64rr : X86::SUB32rr), SPReg)
+      .addReg(SPReg)
+      .addReg(SizeReg);
+
+  // Add the control flow edges we need.
+  MBB.addSuccessor(LoopMBB);
+  LoopMBB->addSuccessor(ContinueMBB);
+  LoopMBB->addSuccessor(LoopMBB);
+
+  // Mark all the instructions added to the prolog as frame setup.
+  if (InProlog) {
+    for (++BeforeMBBI; BeforeMBBI != MBB.end(); ++BeforeMBBI) {
+      BeforeMBBI->setFlag(MachineInstr::FrameSetup);
+    }
+    for (MachineBasicBlock::iterator MI = LoopMBB->begin();
+         MI != LoopMBB->end(); ++MI) {
+      MI->setFlag(MachineInstr::FrameSetup);
+    }
+    for (MachineBasicBlock::iterator CMBBI = ContinueMBB->begin();
+         CMBBI != ContinueMBBI; ++CMBBI) {
+      CMBBI->setFlag(MachineInstr::FrameSetup);
+    }
   }
+
+  return ContinueMBBI;
 }
 
 static unsigned calculateSetFPREG(uint64_t SPAdjust) {
@@ -608,10 +747,11 @@
 */
 
 void X86FrameLowering::emitPrologue(MachineFunction &MF,
-                                    MachineBasicBlock &MBB) const {
+                                    MachineBasicBlock &InMBB) const {
   assert(&STI == &MF.getSubtarget<X86Subtarget>() &&
          "MF used frame lowering for wrong subtarget");
-  MachineBasicBlock::iterator MBBI = MBB.begin();
+  MachineBasicBlock *MBB = &InMBB;
+  MachineBasicBlock::iterator MBBI = MBB->begin();
   MachineFrameInfo *MFI = MF.getFrameInfo();
   const Function *Fn = MF.getFunction();
   MachineModuleInfo &MMI = MF.getMMI();
@@ -641,7 +781,9 @@
   X86FI->setCalleeSavedFrameSize(
       X86FI->getCalleeSavedFrameSize() - TailCallReturnAddrDelta);
 
-  bool UseStackProbe = (STI.isOSWindows() && !STI.isTargetMachO());
+  bool UseRedZone = false;
+  bool UseStackProbe =
+      (STI.isOSWindows() && !STI.isTargetMachO()) || MF.shouldProbeStack();
 
   // The default stack probe size is 4096 if the function has no stackprobesize
   // attribute.
@@ -661,19 +803,26 @@
                           !MFI->hasVarSizedObjects() && // No dynamic alloca.
                           !MFI->adjustsStack() &&       // No calls.
                           !IsWin64CC &&                 // Win64 has no Red Zone
+                          !(UseStackProbe && StackSize > 128) &&
+                                        // Only use the Red Zone if we can fit
+                                        // the whole stack in it and thus stack
+                                        // probes won't be needed
                           !usesTheStack(MF) &&          // Don't push and pop.
                           !MF.shouldSplitStack()) {     // Regular stack
     uint64_t MinSize = X86FI->getCalleeSavedFrameSize();
     if (HasFP) MinSize += SlotSize;
     StackSize = std::max(MinSize, StackSize > 128 ? StackSize - 128 : 0);
     MFI->setStackSize(StackSize);
+    UseRedZone = true;
   }
 
   // Insert stack pointer adjustment for later moving of return addr.  Only
   // applies to tail call optimized functions where the callee argument stack
   // size is bigger than the callers.
   if (TailCallReturnAddrDelta < 0) {
-    BuildStackAdjustment(MBB, MBBI, DL, TailCallReturnAddrDelta,
+    BuildStackAdjustment(*MBB, MBBI, DL, TailCallReturnAddrDelta,
                          /*InEpilogue=*/false)
         .setMIFlag(MachineInstr::FrameSetup);
   }
@@ -714,7 +863,7 @@
     MFI->setOffsetAdjustment(-NumBytes);
 
     // Save EBP/RBP into the appropriate stack slot.
-    BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::PUSH64r : X86::PUSH32r))
+    BuildMI(*MBB, MBBI, DL, TII.get(Is64Bit ? X86::PUSH64r : X86::PUSH32r))
         .addReg(MachineFramePtr, RegState::Kill)
         .setMIFlag(MachineInstr::FrameSetup);
 
@@ -722,24 +871,24 @@
       // Mark the place where EBP/RBP was saved.
       // Define the current CFA rule to use the provided offset.
       assert(StackSize);
-      BuildCFI(MBB, MBBI, DL,
+      BuildCFI(*MBB, MBBI, DL,
               MCCFIInstruction::createDefCfaOffset(nullptr, 2 * stackGrowth));
 
       // Change the rule for the FramePtr to be an "offset" rule.
       unsigned DwarfFramePtr = TRI->getDwarfRegNum(MachineFramePtr, true);
-      BuildCFI(MBB, MBBI, DL, MCCFIInstruction::createOffset(
-                                  nullptr, DwarfFramePtr, 2 * stackGrowth));
+      BuildCFI(*MBB, MBBI, DL, MCCFIInstruction::createOffset(
+                                   nullptr, DwarfFramePtr, 2 * stackGrowth));
     }
 
     if (NeedsWinCFI) {
-      BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_PushReg))
+      BuildMI(*MBB, MBBI, DL, TII.get(X86::SEH_PushReg))
          .addImm(FramePtr)
          .setMIFlag(MachineInstr::FrameSetup);
     }
 
     if (!IsWin64Prologue) {
       // Update EBP with the new base value.
-      BuildMI(MBB, MBBI, DL,
+      BuildMI(*MBB, MBBI, DL,
               TII.get(Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr),
               FramePtr)
           .addReg(StackPtr)
@@ -750,7 +899,7 @@
       // Mark effective beginning of when frame pointer becomes valid.
       // Define the current CFA to use the EBP/RBP register.
       unsigned DwarfFramePtr = TRI->getDwarfRegNum(MachineFramePtr, true);
-      BuildCFI(MBB, MBBI, DL,
+      BuildCFI(*MBB, MBBI, DL,
                MCCFIInstruction::createDefCfaRegister(nullptr, DwarfFramePtr));
     }
 
@@ -765,7 +914,7 @@
   bool PushedRegs = false;
   int StackOffset = 2 * stackGrowth;
 
-  while (MBBI != MBB.end() &&
+  while (MBBI != MBB->end() &&
          MBBI->getFlag(MachineInstr::FrameSetup) &&
          (MBBI->getOpcode() == X86::PUSH32r ||
          MBBI->getOpcode() == X86::PUSH64r)) {
@@ -777,13 +926,13 @@
       // Mark callee-saved push instruction.
      // Define the current CFA rule to use the provided offset.
       assert(StackSize);
-      BuildCFI(MBB, MBBI, DL,
+      BuildCFI(*MBB, MBBI, DL,
                MCCFIInstruction::createDefCfaOffset(nullptr, StackOffset));
       StackOffset += stackGrowth;
     }
 
     if (NeedsWinCFI) {
-      BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_PushReg)).addImm(Reg).setMIFlag(
+      BuildMI(*MBB, MBBI, DL, TII.get(X86::SEH_PushReg)).addImm(Reg).setMIFlag(
           MachineInstr::FrameSetup);
     }
   }
 
@@ -793,13 +942,13 @@
   // Don't do this for Win64, it needs to realign the stack after the prologue.
   if (!IsWin64Prologue && TRI->needsStackRealignment(MF)) {
     assert(HasFP && "There should be a frame pointer if stack is realigned.");
-    BuildStackAlignAND(MBB, MBBI, DL, MaxAlign);
+    BuildStackAlignAND(*MBB, MBBI, DL, MaxAlign);
   }
 
   // If there is an SUB32ri of ESP immediately before this instruction, merge
   // the two. This can be the case when tail call elimination is enabled and
   // the callee has more arguments then the caller.
-  NumBytes -= mergeSPUpdates(MBB, MBBI, true);
+  NumBytes -= mergeSPUpdates(*MBB, MBBI, true);
 
   // Adjust stack pointer: ESP -= numbytes.
 
@@ -815,69 +964,86 @@
   if (IsWin64Prologue && TRI->needsStackRealignment(MF))
     AlignedNumBytes = RoundUpToAlignment(AlignedNumBytes, MaxAlign);
   if (AlignedNumBytes >= StackProbeSize && UseStackProbe) {
-    // Check whether EAX is livein for this function.
-    bool isEAXAlive = isEAXLiveIn(MF);
+    assert(!UseRedZone && "The Red Zone is not accounted for in stack probes");
 
-    if (isEAXAlive) {
-      // Sanity check that EAX is not livein for this function.
-      // It should not be, so throw an assert.
-      assert(!Is64Bit && "EAX is livein in x64 case!");
+    uint64_t PageSize = 0x1000;
 
-      // Save EAX
-      BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH32r))
-        .addReg(X86::EAX, RegState::Kill)
-        .setMIFlag(MachineInstr::FrameSetup);
-    }
+    // If we only need to probe five pages or fewer, emit the probes inline;
+    // this is exactly what the probe loop would do anyway. Five probes is
+    // also what GCC emits before it falls back to a loop.
+    if (NumBytes <= 5 * PageSize) {
+      for (uint64_t i = 0; i < NumBytes / PageSize; ++i) {
+        BuildMI(*MBB, MBBI, DL, TII.get(Is64Bit ? X86::OR64mi8 : X86::OR32mi8))
+            .addReg(StackPtr)
+            .addImm(1)
+            .addReg(0)
+            .addImm(-(int64_t)((i + 1) * PageSize))
+            .addReg(0)
+            .addImm(0)
+            .setMIFlag(MachineInstr::FrameSetup);
+      }
+
+      BuildMI(*MBB, MBBI, DL, TII.get(Is64Bit ? X86::SUB64ri32 : X86::SUB32ri),
+              StackPtr)
+          .addReg(StackPtr)
+          .addImm(NumBytes)
+          .setMIFlag(MachineInstr::FrameSetup);
+
+    } else {
+      // We spill the registers we need for the stack probe loop.
+
+      bool RAXAlive, RCXAlive, RDXAlive;
+
+      // TODO: Push the registers if they are callee-saved.
+
+      pushRegForStackProbeCall(MF, *MBB, MBBI, DL, RAXAlive, X86::RAX,
+                               NumBytes);
+      pushRegForStackProbeCall(MF, *MBB, MBBI, DL, RCXAlive, X86::RCX,
+                               NumBytes);
+      pushRegForStackProbeCall(MF, *MBB, MBBI, DL, RDXAlive, X86::RDX,
+                               NumBytes);
 
-    if (Is64Bit) {
-      // Handle the 64-bit Windows ABI case where we need to call __chkstk.
-      // Function prologue is responsible for adjusting the stack pointer.
-      if (isUInt<32>(NumBytes)) {
-        BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX)
-            .addImm(NumBytes)
-            .setMIFlag(MachineInstr::FrameSetup);
-      } else if (isInt<32>(NumBytes)) {
-        BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64ri32), X86::RAX)
-            .addImm(NumBytes)
-            .setMIFlag(MachineInstr::FrameSetup);
-      } else {
-        BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64ri), X86::RAX)
-            .addImm(NumBytes)
-            .setMIFlag(MachineInstr::FrameSetup);
-      }
-    } else {
-      // Allocate NumBytes-4 bytes on stack in case of isEAXAlive.
-      // We'll also use 4 already allocated bytes for EAX.
-      BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX)
-          .addImm(isEAXAlive ? NumBytes - 4 : NumBytes)
-          .setMIFlag(MachineInstr::FrameSetup);
-    }
+      if (Is64Bit) {
+        // Materialize the number of bytes to probe in RAX. emitStackProbes
+        // consumes it and performs the final stack adjustment itself.
+        if (isUInt<32>(NumBytes)) {
+          BuildMI(*MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX)
+              .addImm(NumBytes)
+              .setMIFlag(MachineInstr::FrameSetup);
+        } else if (isInt<32>(NumBytes)) {
+          BuildMI(*MBB, MBBI, DL, TII.get(X86::MOV64ri32), X86::RAX)
+              .addImm(NumBytes)
+              .setMIFlag(MachineInstr::FrameSetup);
+        } else {
+          BuildMI(*MBB, MBBI, DL, TII.get(X86::MOV64ri), X86::RAX)
+              .addImm(NumBytes)
+              .setMIFlag(MachineInstr::FrameSetup);
        }
+      } else {
+        BuildMI(*MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX)
+            .addImm(NumBytes)
+            .setMIFlag(MachineInstr::FrameSetup);
+      }
 
-    // Save a pointer to the MI where we set AX.
-    MachineBasicBlock::iterator SetRAX = MBBI;
-    --SetRAX;
+      // Emit the stack probes. This may split the current basic block, so
+      // reload MBB and MBBI from the instruction where execution resumes.
+      MachineInstr *NextInstr = emitStackProbes(MF, *MBB, MBBI, DL, true);
+      MBBI = NextInstr;
+      MBB = NextInstr->getParent();
 
-    // Call __chkstk, __chkstk_ms, or __alloca.
-    emitStackProbeCall(MF, MBB, MBBI, DL);
+      // Now we restore the spilled registers from the stack.
 
-    // Apply the frame setup flag to all inserted instrs.
-    for (; SetRAX != MBBI; ++SetRAX)
-      SetRAX->setFlag(MachineInstr::FrameSetup);
-
-    if (isEAXAlive) {
-      // Restore EAX
-      MachineInstr *MI = addRegOffset(BuildMI(MF, DL, TII.get(X86::MOV32rm),
-                                              X86::EAX),
-                                      StackPtr, false, NumBytes - 4);
-      MI->setFlag(MachineInstr::FrameSetup);
-      MBB.insert(MBBI, MI);
+      popRegForStackProbeCall(MF, *MBB, MBBI, DL, RDXAlive, X86::RDX, NumBytes);
+      popRegForStackProbeCall(MF, *MBB, MBBI, DL, RCXAlive, X86::RCX, NumBytes);
+      popRegForStackProbeCall(MF, *MBB, MBBI, DL, RAXAlive, X86::RAX, NumBytes);
     }
   } else if (NumBytes) {
-    emitSPUpdate(MBB, MBBI, -(int64_t)NumBytes, /*InEpilogue=*/false);
+    emitSPUpdate(*MBB, MBBI, -(int64_t)NumBytes, /*InEpilogue=*/false);
   }
 
   if (NeedsWinCFI && NumBytes)
-    BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_StackAlloc))
+    BuildMI(*MBB, MBBI, DL, TII.get(X86::SEH_StackAlloc))
        .addImm(NumBytes)
        .setMIFlag(MachineInstr::FrameSetup);
 
@@ -885,19 +1051,19 @@
   if (IsWin64Prologue && HasFP) {
     SEHFrameOffset = calculateSetFPREG(NumBytes);
     if (SEHFrameOffset)
-      addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::LEA64r), FramePtr),
+      addRegOffset(BuildMI(*MBB, MBBI, DL, TII.get(X86::LEA64r), FramePtr),
                    StackPtr, false, SEHFrameOffset);
     else
-      BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64rr), FramePtr).addReg(StackPtr);
+      BuildMI(*MBB, MBBI, DL, TII.get(X86::MOV64rr), FramePtr).addReg(StackPtr);
 
     if (NeedsWinCFI)
-      BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SetFrame))
+      BuildMI(*MBB, MBBI, DL, TII.get(X86::SEH_SetFrame))
          .addImm(FramePtr)
          .addImm(SEHFrameOffset)
          .setMIFlag(MachineInstr::FrameSetup);
   }
 
-  while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup)) {
+  while (MBBI != MBB->end() && MBBI->getFlag(MachineInstr::FrameSetup)) {
     const MachineInstr *FrameInstr = &*MBBI;
     ++MBBI;
 
@@ -909,7 +1075,7 @@
           int Offset = getFrameIndexReference(MF, FI, IgnoredFrameReg);
           Offset += SEHFrameOffset;
 
-          BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SaveXMM))
+          BuildMI(*MBB, MBBI, DL, TII.get(X86::SEH_SaveXMM))
              .addImm(Reg)
              .addImm(Offset)
              .setMIFlag(MachineInstr::FrameSetup);
@@ -919,7 +1085,7 @@
   }
 
   if (NeedsWinCFI)
-    BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_EndPrologue))
+    BuildMI(*MBB, MBBI, DL, TII.get(X86::SEH_EndPrologue))
        .setMIFlag(MachineInstr::FrameSetup);
 
   // Realign stack after we spilled callee-saved registers (so that we'll be
@@ -927,7 +1093,7 @@
   // able to calculate their offsets from the frame pointer).
   // Win64 requires aligning the stack after the prologue.
   if (IsWin64Prologue && TRI->needsStackRealignment(MF)) {
     assert(HasFP && "There should be a frame pointer if stack is realigned.");
-    BuildStackAlignAND(MBB, MBBI, DL, MaxAlign);
+    BuildStackAlignAND(*MBB, MBBI, DL, MaxAlign);
   }
 
   // If we need a base pointer, set it up here. It's whatever the value
@@ -937,14 +1103,14 @@
   if (TRI->hasBasePointer(MF)) {
     // Update the base pointer with the current stack pointer.
     unsigned Opc = Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr;
-    BuildMI(MBB, MBBI, DL, TII.get(Opc), BasePtr)
+    BuildMI(*MBB, MBBI, DL, TII.get(Opc), BasePtr)
        .addReg(StackPtr)
        .setMIFlag(MachineInstr::FrameSetup);
     if (X86FI->getRestoreBasePointer()) {
       // Stash value of base pointer.  Saving RSP instead of EBP shortens
       // dependence chain. Used by SjLj EH.
       unsigned Opm = Uses64BitFramePtr ? X86::MOV64mr : X86::MOV32mr;
-      addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opm)),
+      addRegOffset(BuildMI(*MBB, MBBI, DL, TII.get(Opm)),
                    FramePtr, true, X86FI->getRestoreBasePointerOffset())
          .addReg(StackPtr)
          .setMIFlag(MachineInstr::FrameSetup);
@@ -957,7 +1123,7 @@
      // other way around.
      unsigned Opm = Uses64BitFramePtr ? X86::MOV64mr : X86::MOV32mr;
      unsigned IgnoredFrameReg;
-      addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opm)), BasePtr, true,
+      addRegOffset(BuildMI(*MBB, MBBI, DL, TII.get(Opm)), BasePtr, true,
                   getFrameIndexReference(MF, X86FI->getSEHFramePtrSaveIndex(),
                                          IgnoredFrameReg))
          .addReg(FramePtr)
@@ -970,13 +1136,13 @@
   if (!HasFP && NumBytes) {
     // Define the current CFA rule to use the provided offset.
     assert(StackSize);
-    BuildCFI(MBB, MBBI, DL, MCCFIInstruction::createDefCfaOffset(
+    BuildCFI(*MBB, MBBI, DL, MCCFIInstruction::createDefCfaOffset(
                                 nullptr, -StackSize + stackGrowth));
   }
 
   // Emit DWARF info specifying the offsets of the callee-saved registers.
   if (PushedRegs)
-    emitCalleeSavedFrameMoves(MBB, MBBI, DL);
+    emitCalleeSavedFrameMoves(*MBB, MBBI, DL);
 }
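Note: to summarize the two code paths emitPrologue now takes, frames of at most
five pages get unrolled probes followed by a single stack adjustment, while
larger frames go through the loop in emitStackProbes. A sketch of the x86-64
output for the loop case, mirroring the X64-LINUX checks in the new
stack-probes.ll test below (sizes are the ones from that test; RAX/RCX/RDX are
the in-prologue register choices made above):

    ;   movl $40104, %eax      ; RAX = number of bytes to allocate
    ;   movq %rax, %rdx        ; RDX = remaining byte count
    ;   movq %rsp, %rcx        ; RCX = probe cursor; RSP stays put
    ; .LBB0_1:
    ;   orq  $0, (%rcx)        ; touch the current page
    ;   subq $4096, %rcx       ; step down one guard page
    ;   subq $4096, %rdx
    ;   jae  .LBB0_1           ; continue while a full page remains
    ;   subq %rax, %rsp        ; the allocation itself happens only here
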
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -15047,7 +15047,7 @@
   MachineFunction &MF = DAG.getMachineFunction();
   bool SplitStack = MF.shouldSplitStack();
   bool Lower = (Subtarget->isOSWindows() && !Subtarget->isTargetMachO()) ||
-               SplitStack;
+               SplitStack || MF.shouldProbeStack();
   SDLoc dl(Op);
 
   if (!Lower) {
@@ -15125,6 +15125,7 @@
 
   Chain = DAG.getCopyToReg(Chain, dl, Reg, Size, Flag);
   Flag = Chain.getValue(1);
+
   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
 
   Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag);
@@ -20622,14 +20623,12 @@
 X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI,
                                         MachineBasicBlock *BB) const {
   DebugLoc DL = MI->getDebugLoc();
-
-  assert(!Subtarget->isTargetMachO());
-
-  Subtarget->getFrameLowering()->emitStackProbeCall(*BB->getParent(), *BB, MI,
-                                                    DL);
-
+  MachineInstr *ResumeMI =
+      Subtarget->getFrameLowering()->emitStackProbes(*BB->getParent(), *BB, MI,
+                                                     DL, false);
+  MachineBasicBlock *ResumeBB = ResumeMI->getParent();
   MI->eraseFromParent(); // The pseudo instruction is gone now.
-  return BB;
+  return ResumeBB;
 }
 
 MachineBasicBlock *
Index: test/CodeGen/X86/dynamic-alloca-in-entry.ll
===================================================================
--- test/CodeGen/X86/dynamic-alloca-in-entry.ll
+++ test/CodeGen/X86/dynamic-alloca-in-entry.ll
@@ -6,7 +6,7 @@
   ret void
 }
 ; CHECK-LABEL: _foo:
-; CHECK: calll __chkstk
+; CHECK: or{{.}} $0, {{.*}}
 ; CHECK: retl
 
 ; Use of inalloca implies that the alloca is not static.
@@ -15,5 +15,5 @@
   ret void
 }
 ; CHECK-LABEL: _bar:
-; CHECK: calll __chkstk
+; CHECK: or{{.}} $0, {{.*}}
 ; CHECK: retl
Index: test/CodeGen/X86/inalloca-ctor.ll
===================================================================
--- test/CodeGen/X86/inalloca-ctor.ll
+++ test/CodeGen/X86/inalloca-ctor.ll
@@ -13,7 +13,7 @@
   %args = alloca inalloca %frame
   %c = getelementptr %frame, %frame* %args, i32 0, i32 2
 ; CHECK: movl $20, %eax
-; CHECK: calll __chkstk
+; CHECK: or{{.}} $0, {{.*}}
 ; CHECK: movl %esp,
   call void @Foo_ctor(%Foo* %c)
 ; CHECK: leal 12(%{{.*}}),
Index: test/CodeGen/X86/inalloca-invoke.ll
===================================================================
--- test/CodeGen/X86/inalloca-invoke.ll
+++ test/CodeGen/X86/inalloca-invoke.ll
@@ -21,7 +21,7 @@
   %beg = getelementptr %frame.reverse, %frame.reverse* %rev_args, i32 0, i32 0
   %end = getelementptr %frame.reverse, %frame.reverse* %rev_args, i32 0, i32 1
 
-; CHECK: calll __chkstk
+; CHECK: or{{.}} $0, {{.*}}
 ; CHECK: movl %esp, %[[beg:[^ ]*]]
 ; CHECK: leal 12(%[[beg]]), %[[end:[^ ]*]]
Index: test/CodeGen/X86/inalloca-stdcall.ll
===================================================================
--- test/CodeGen/X86/inalloca-stdcall.ll
+++ test/CodeGen/X86/inalloca-stdcall.ll
@@ -9,7 +9,7 @@
 ; CHECK-LABEL: _g:
   %b = alloca inalloca %Foo
 ; CHECK: movl $8, %eax
-; CHECK: calll __chkstk
+; CHECK: or{{.}} $0, {{.*}}
   %f1 = getelementptr %Foo, %Foo* %b, i32 0, i32 0
   %f2 = getelementptr %Foo, %Foo* %b, i32 0, i32 1
   store i32 13, i32* %f1
Index: test/CodeGen/X86/inalloca.ll
===================================================================
--- test/CodeGen/X86/inalloca.ll
+++ test/CodeGen/X86/inalloca.ll
@@ -9,7 +9,7 @@
 entry:
   %b = alloca inalloca %Foo
 ; CHECK: movl $8, %eax
-; CHECK: calll __chkstk
+; CHECK: or{{.}} $0, {{.*}}
   %f1 = getelementptr %Foo, %Foo* %b, i32 0, i32 0
   %f2 = getelementptr %Foo, %Foo* %b, i32 0, i32 1
   store i32 13, i32* %f1
@@ -28,7 +28,7 @@
 entry:
   %b = alloca inalloca %Foo
 ; CHECK: movl $8, %eax
-; CHECK: calll __chkstk
+; CHECK: or{{.}} $0, {{.*}}
   %f1 = getelementptr %Foo, %Foo* %b, i32 0, i32 0
   %f2 = getelementptr %Foo, %Foo* %b, i32 0, i32 1
   store i32 13, i32* %f1
@@ -48,7 +48,7 @@
 entry:
   %b = alloca inalloca %Foo
 ; CHECK: movl $8, %eax
-; CHECK: calll __chkstk
+; CHECK: or{{.}} $0, {{.*}}
   %f1 = getelementptr %Foo, %Foo* %b, i32 0, i32 0
   %f2 = getelementptr %Foo, %Foo* %b, i32 0, i32 1
   store i32 13, i32* %f1
Index: test/CodeGen/X86/mem-intrin-base-reg.ll
===================================================================
--- test/CodeGen/X86/mem-intrin-base-reg.ll
+++ test/CodeGen/X86/mem-intrin-base-reg.ll
@@ -65,7 +65,7 @@
 ; CHECK: movl %esp, %esi
 ; CHECK: pushl $128
 ; CHECK: calll _memcpy
-; CHECK: calll __chkstk
+; CHECK: or{{.}} $0, {{.*}}
 
 ; stosd doesn't clobber esi, so we can use it.
Index: test/CodeGen/X86/mingw-alloca.ll
===================================================================
--- test/CodeGen/X86/mingw-alloca.ll
+++ test/CodeGen/X86/mingw-alloca.ll
@@ -6,9 +6,9 @@
 define void @foo1(i32 %N) nounwind {
 entry:
 ; COFF: _foo1:
-; COFF: calll __alloca
+; COFF: or{{.}} $0, {{.*}}
 ; ELF: foo1:
-; ELF: calll _alloca
+; ELF: or{{.}} $0, {{.*}}
 	%tmp14 = alloca i32, i32 %N		; <i32*> [#uses=1]
 	call void @bar1( i32* %tmp14 )
 	ret void
@@ -20,14 +20,10 @@
 entry:
 ; COFF: _foo2:
 ; COFF: andl $-16, %esp
-; COFF: pushl %eax
-; COFF: calll __alloca
-; COFF: movl 8028(%esp), %eax
+; COFF: or{{.}} $0, {{.*}}
 ; ELF: foo2:
 ; ELF: andl $-16, %esp
-; ELF: pushl %eax
-; ELF: calll _alloca
-; ELF: movl 8028(%esp), %eax
+; ELF: or{{.}} $0, {{.*}}
 	%A2 = alloca [2000 x i32], align 16		; <[2000 x i32]*> [#uses=1]
 	%A2.sub = getelementptr [2000 x i32], [2000 x i32]* %A2, i32 0, i32 0		; <i32*> [#uses=1]
 	call void @bar2( i32* %A2.sub, i32 %N )
Index: test/CodeGen/X86/movtopush.ll
===================================================================
--- test/CodeGen/X86/movtopush.ll
+++ test/CodeGen/X86/movtopush.ll
@@ -67,7 +67,6 @@
 
 ; If we have a reserved frame, we should have pushes
 ; NORMAL-LABEL: test2:
-; NORMAL-NOT: subl {{.*}} %esp
 ; NORMAL: pushl $4
 ; NORMAL-NEXT: pushl $3
 ; NORMAL-NEXT: pushl $2
Index: test/CodeGen/X86/pr17631.ll
===================================================================
--- test/CodeGen/X86/pr17631.ll
+++ test/CodeGen/X86/pr17631.ll
@@ -18,7 +18,7 @@
 
 ; CHECK: equal
 ; CHECK-NOT: vzeroupper
-; CHECK: _chkstk
+; CHECK: or{{.}} $0, {{.*}}
 ; CHECK: ret
 
 define <8 x float> @foo(<8 x float> %y, i64* %p, double %x) {
Index: test/CodeGen/X86/stack-probe-size.ll
===================================================================
--- test/CodeGen/X86/stack-probe-size.ll
+++ test/CodeGen/X86/stack-probe-size.ll
@@ -11,17 +11,6 @@
 
 target datalayout = "e-m:w-p:32:32-i64:64-f80:32-n8:16:32-S32"
 
-define i32 @test1() "stack-probe-size"="0" {
-  %buffer = alloca [4095 x i8]
-
-  ret i32 0
-
-; CHECK-LABEL: _test1:
-; CHECK-NOT: subl $4095, %esp
-; CHECK: movl $4095, %eax
-; CHECK: calll __chkstk
-}
-
 define i32 @test2() {
   %buffer = alloca [4095 x i8]
 
@@ -30,7 +19,7 @@
 ; CHECK-LABEL: _test2:
 ; CHECK-NOT: movl $4095, %eax
 ; CHECK: subl $4095, %esp
-; CHECK-NOT: calll __chkstk
+; CHECK-NOT: or{{.}} $0, {{.*}}
 }
 
 define i32 @test3() "stack-probe-size"="8192" {
@@ -41,7 +30,7 @@
 ; CHECK-LABEL: _test3:
 ; CHECK-NOT: movl $4095, %eax
 ; CHECK: subl $4095, %esp
-; CHECK-NOT: calll __chkstk
+; CHECK-NOT: or{{.}} $0, {{.*}}
 }
 
 define i32 @test4() "stack-probe-size"="0" {
@@ -51,8 +40,7 @@
 
 ; CHECK-LABEL: _test4:
 ; CHECK-NOT: subl $4096, %esp
-; CHECK: movl $4096, %eax
-; CHECK: calll __chkstk
+; CHECK: or{{.}} $0, {{.*}}
 }
 
 define i32 @test5() {
@@ -62,8 +50,7 @@
 
 ; CHECK-LABEL: _test5:
 ; CHECK-NOT: subl $4096, %esp
-; CHECK: movl $4096, %eax
-; CHECK: calll __chkstk
+; CHECK: or{{.}} $0, {{.*}}
 }
 
 define i32 @test6() "stack-probe-size"="8192" {
@@ -74,5 +61,5 @@
 
 ; CGECK-LABEL: _test6:
 ; CGECK-NOT: movl $4096, %eax
 ; CGECK: subl $4096, %esp
-; CGECK-NOT: calll __chkstk
+; CGECK-NOT: or{{.}} $0, {{.*}}
 }
Index: test/CodeGen/X86/stack-probes.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/stack-probes.ll
@@ -0,0 +1,54 @@
+; RUN: llc -mtriple=i386-pc-linux-gnu < %s -o - | FileCheck --check-prefix=X86-LINUX %s
+; RUN: llc -mtriple=x86_64-pc-linux-gnu < %s -o - | FileCheck --check-prefix=X64-LINUX %s
+
+declare void @use([40096 x i8]*)
+
+; Ensure an inline probe loop is emitted for large stack frames.
+define void @test() "probe-stack" {
+  %array = alloca [40096 x i8]
+  call void @use([40096 x i8]* %array)
+  ret void
+
+; X86-LINUX-LABEL: test:
+; X86-LINUX: movl $40108, %eax # imm = 0x9CAC
+; X86-LINUX-NEXT: movl %eax, %edx
+; X86-LINUX-NEXT: movl %esp, %ecx
+; X86-LINUX-LABEL: .LBB0_1
+; X86-LINUX-NEXT: orl $0, (%ecx)
+; X86-LINUX-NEXT: subl $4096, %ecx
+; X86-LINUX-NEXT: subl $4096, %edx
+; X86-LINUX-NEXT: jae .LBB0_1
+; X86-LINUX: subl %eax, %esp
+; X86-LINUX: addl $40108, %esp # imm = 0x9CAC
+
+; X64-LINUX-LABEL: test:
+; X64-LINUX: movl $40104, %eax # imm = 0x9CA8
+; X64-LINUX-NEXT: movq %rax, %rdx
+; X64-LINUX-NEXT: movq %rsp, %rcx
+; X64-LINUX-LABEL: .LBB0_1
+; X64-LINUX-NEXT: orq $0, (%rcx)
+; X64-LINUX-NEXT: subq $4096, %rcx
+; X64-LINUX-NEXT: subq $4096, %rdx
+; X64-LINUX-NEXT: jae .LBB0_1
+; X64-LINUX: subq %rax, %rsp
+; X64-LINUX: addq $40104, %rsp # imm = 0x9CA8
+
+}
+
+declare void @useFast([4096 x i8]*)
+
+; Ensure the stack is probed with unrolled probes for medium stack frames.
+define void @testFast() "probe-stack" {
+  %array = alloca [4096 x i8]
+  call void @useFast([4096 x i8]* %array)
+  ret void
+
+; X86-LINUX-LABEL: testFast:
+; X86-LINUX: orl $0, -4096(%esp)
+; X86-LINUX-NEXT: subl $4108, %esp # imm = 0x100C
+
+; X64-LINUX-LABEL: testFast:
+; X64-LINUX: orq $0, -4096(%rsp)
+; X64-LINUX-NEXT: subq $4104, %rsp # imm = 0x1008
+
+}
Index: test/CodeGen/X86/win64_alloca_dynalloca.ll
===================================================================
--- test/CodeGen/X86/win64_alloca_dynalloca.ll
+++ test/CodeGen/X86/win64_alloca_dynalloca.ll
@@ -1,6 +1,5 @@
 ; RUN: llc < %s -mcpu=generic -enable-misched=false -mtriple=x86_64-mingw32 | FileCheck %s -check-prefix=M64
 ; RUN: llc < %s -mcpu=generic -enable-misched=false -mtriple=x86_64-win32 | FileCheck %s -check-prefix=W64
-; RUN: llc < %s -mcpu=generic -enable-misched=false -mtriple=x86_64-win32 -code-model=large | FileCheck %s -check-prefix=L64
 ; RUN: llc < %s -mcpu=generic -enable-misched=false -mtriple=x86_64-win32-macho | FileCheck %s -check-prefix=EFI
 ; PR8777
 ; PR8778
@@ -14,23 +13,15 @@
 
   %buf0 = alloca i8, i64 4096, align 1
 
 ; ___chkstk_ms does not adjust %rsp.
-; M64: $4096, %eax
-; M64: callq ___chkstk_ms
-; M64: subq %rax, %rsp
+; M64: or{{.}} $0, {{.*}}
+; M64: subq {{.*}}, %rsp
 ; M64: leaq 128(%rsp), %rbp
 
 ; __chkstk does not adjust %rsp.
-; W64: $4096, %eax
-; W64: callq __chkstk
-; W64: subq %rax, %rsp
+; W64: or{{.}} $0, {{.*}}
+; W64: subq {{.*}}, %rsp
 ; W64: leaq 128(%rsp), %rbp
 
-; Use %r11 for the large model.
-; L64: $4096, %eax
-; L64: movabsq $__chkstk, %r11
-; L64: callq *%r11
-; L64: subq %rax, %rsp
-
 ; Freestanding
 ; EFI: $[[B0OFS:4096|4104]], %rsp
 ; EFI-NOT: call
@@ -39,23 +30,16 @@
 
 ; M64: leaq 15(%{{.*}}), %rax
 ; M64: andq $-16, %rax
-; M64: callq ___chkstk_ms
-; M64: subq %rax, %rsp
+; M64: or{{.}} $0, {{.*}}
+; M64: subq {{.*}}, %rsp
 ; M64: movq %rsp, %rax
 
 ; W64: leaq 15(%{{.*}}), %rax
 ; W64: andq $-16, %rax
-; W64: callq __chkstk
-; W64: subq %rax, %rsp
+; W64: or{{.}} $0, {{.*}}
+; W64: subq {{.*}}, %rsp
 ; W64: movq %rsp, %rax
 
-; L64: leaq 15(%{{.*}}), %rax
-; L64: andq $-16, %rax
-; L64: movabsq $__chkstk, %r11
-; L64: callq *%r11
-; L64: subq %rax, %rsp
-; L64: movq %rsp, %rax
-
 ; EFI: leaq 15(%{{.*}}), [[R1:%r.*]]
 ; EFI: andq $-16, [[R1]]
 ; EFI: movq %rsp, [[R64:%r.*]]
@@ -97,16 +81,16 @@
 
 ; M64: leaq 15(%{{.*}}), %rax
 ; M64: andq $-16, %rax
-; M64: callq ___chkstk_ms
-; M64: subq %rax, %rsp
+; M64: or{{.}} $0, {{.*}}
+; M64: subq {{.*}}, %rsp
 ; M64: movq %rsp, [[R2:%r.*]]
 ; M64: andq $-128, [[R2]]
 ; M64: movq [[R2]], %rsp
 
 ; W64: leaq 15(%{{.*}}), %rax
 ; W64: andq $-16, %rax
-; W64: callq __chkstk
-; W64: subq %rax, %rsp
+; W64: or{{.}} $0, {{.*}}
+; W64: subq {{.*}}, %rsp
 ; W64: movq %rsp, [[R2:%r.*]]
 ; W64: andq $-128, [[R2]]
 ; W64: movq [[R2]], %rsp
Index: test/CodeGen/X86/win64_eh.ll
===================================================================
--- test/CodeGen/X86/win64_eh.ll
+++ test/CodeGen/X86/win64_eh.ll
@@ -37,9 +37,7 @@
 }
 ; WIN64-LABEL: foo2:
 ; WIN64: .seh_proc foo2
-; WIN64: movl $8000, %eax
-; WIN64: callq {{__chkstk|___chkstk_ms}}
-; WIN64: subq %rax, %rsp
+; WIN64: or{{.}} $0, {{.*}}
 ; WIN64: .seh_stackalloc 8000
 ; WIN64: .seh_endprologue
 ; WIN64: addq $8000, %rsp
Index: test/CodeGen/X86/win64_frame.ll
===================================================================
--- test/CodeGen/X86/win64_frame.ll
+++ test/CodeGen/X86/win64_frame.ll
@@ -103,7 +103,7 @@
   ; CHECK:        leaq 15(,%rax,4), %rcx
   ; CHECK:        movabsq $34359738352, %rax
   ; CHECK:        andq %rcx, %rax
-  ; CHECK:        callq __chkstk
+  ; CHECK:        or{{.}} $0, {{.*}}
   ; CHECK:        subq %rax, %rsp
 
   %gep = getelementptr [300 x i8], [300 x i8]* %alloca, i32 0, i32 0
Index: test/CodeGen/X86/win_chkstk.ll
===================================================================
--- test/CodeGen/X86/win_chkstk.ll
+++ test/CodeGen/X86/win_chkstk.ll
@@ -1,8 +1,7 @@
-; RUN: llc < %s -mtriple=i686-pc-win32 | FileCheck %s -check-prefix=WIN_X32
-; RUN: llc < %s -mtriple=x86_64-pc-win32 | FileCheck %s -check-prefix=WIN_X64
-; RUN: llc < %s -mtriple=x86_64-pc-win32 -code-model=large | FileCheck %s -check-prefix=WIN64_LARGE
-; RUN: llc < %s -mtriple=i686-pc-mingw32 | FileCheck %s -check-prefix=MINGW_X32
-; RUN: llc < %s -mtriple=x86_64-pc-mingw32 | FileCheck %s -check-prefix=MINGW_X64
+; RUN: llc < %s -mtriple=i686-pc-win32 | FileCheck %s -check-prefix=WIN
+; RUN: llc < %s -mtriple=x86_64-pc-win32 | FileCheck %s -check-prefix=WIN
+; RUN: llc < %s -mtriple=i686-pc-mingw32 | FileCheck %s -check-prefix=WIN
+; RUN: llc < %s -mtriple=x86_64-pc-mingw32 | FileCheck %s -check-prefix=WIN
 ; RUN: llc < %s -mtriple=i386-pc-linux | FileCheck %s -check-prefix=LINUX
 ; RUN: llc < %s -mtriple=x86_64-pc-win32-macho | FileCheck %s -check-prefix=LINUX
 
@@ -15,13 +14,8 @@
 ; Stack allocation >= 4096 bytes will require call to __chkstk in the Windows ABI.
 define i32 @main4k() nounwind {
 entry:
-; WIN_X32: calll __chkstk
-; WIN_X64: callq __chkstk
-; WIN64_LARGE: movabsq $__chkstk, %r11
-; WIN64_LARGE: callq *%r11
-; MINGW_X32: calll __alloca
-; MINGW_X64: callq ___chkstk_ms
-; LINUX-NOT: call __chkstk
+; WIN: or{{.}} $0, {{.*}}
+; LINUX-NOT: or{{[ql]}} $0, {{.*}}
   %array4096 = alloca [4096 x i8], align 16       ; <[4096 x i8]*> [#uses=0]
   ret i32 0
 }
@@ -30,21 +24,8 @@
 ; allocation.
 define i32 @main128() nounwind {
 entry:
-; WIN_X32: # BB#0:
-; WIN_X32-NOT: calll __chkstk
-; WIN_X32: ret
-
-; WIN_X64: # BB#0:
-; WIN_X64-NOT: callq __chkstk
-; WIN_X64: ret
-
-; MINGW_X64: # BB#0:
-; MINGW_X64-NOT: callq ___chkstk_ms
-; MINGW_X64: ret
-
-; LINUX: # BB#0:
-; LINUX-NOT: call __chkstk
-; LINUX: ret
+; WIN-NOT: or{{.}} $0, {{.*}}
+; LINUX-NOT: or{{.}} $0, {{.*}}
   %array128 = alloca [128 x i8], align 16         ; <[128 x i8]*> [#uses=0]
   ret i32 0
 }
@@ -53,13 +34,8 @@
 ; caller has the Win64 calling convention.
 define x86_64_win64cc i32 @main4k_win64() nounwind {
 entry:
-; WIN_X32: calll __chkstk
-; WIN_X64: callq __chkstk
-; WIN64_LARGE: movabsq $__chkstk, %r11
-; WIN64_LARGE: callq *%r11
-; MINGW_X32: calll __alloca
-; MINGW_X64: callq ___chkstk_ms
-; LINUX-NOT: call __chkstk
+; WIN: or{{.}} $0, {{.*}}
+; LINUX-NOT: or{{.}} $0, {{.*}}
   %array4096 = alloca [4096 x i8], align 16       ; <[4096 x i8]*> [#uses=0]
   ret i32 0
 }
Index: test/CodeGen/X86/windows-itanium-alloca.ll
===================================================================
--- test/CodeGen/X86/windows-itanium-alloca.ll
+++ test/CodeGen/X86/windows-itanium-alloca.ll
@@ -12,5 +12,4 @@
   ret void
 }
 
-; CHECK: __chkstk
-
+; CHECK: orl $0, (%{{.*}})
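Note on the interaction with the existing "stack-probe-size" attribute: the
probe threshold is still taken from that attribute (default 4096), so a frame
smaller than the threshold is allocated with a plain sub even when
"probe-stack" is set. A hypothetical sketch, not part of this patch's tests
(the function name is made up; both attribute strings are the ones handled by
the code above):

    ; With the threshold raised to 8192, this 4095-byte frame needs no probes.
    define i32 @below_threshold() "probe-stack" "stack-probe-size"="8192" {
      %buffer = alloca [4095 x i8]
      ret i32 0
    }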