Index: include/llvm/CodeGen/MachineFunction.h
===================================================================
--- include/llvm/CodeGen/MachineFunction.h
+++ include/llvm/CodeGen/MachineFunction.h
@@ -504,6 +504,14 @@
   /// Should we be emitting segmented stack stuff for the function
   bool shouldSplitStack() const;
 
+  /// \brief Whether we should be probing the stack for the function.
+  ///
+  /// Probing the stack means that we must read or write to the stack on every
+  /// page. This is to ensure that a guard page will be hit and stack overflow
+  /// can be detected. We insert instructions to do this when allocating from
+  /// the stack.
+  bool shouldProbeStack() const;
+
   /// getNumBlockIDs - Return the number of MBB ID's allocated.
   unsigned getNumBlockIDs() const { return (unsigned)MBBNumbering.size(); }
 
Index: lib/CodeGen/MachineFunction.cpp
===================================================================
--- lib/CodeGen/MachineFunction.cpp
+++ lib/CodeGen/MachineFunction.cpp
@@ -213,6 +213,10 @@
   return getFunction()->hasFnAttribute("split-stack");
 }
 
+bool MachineFunction::shouldProbeStack() const {
+  return getFunction()->hasFnAttribute("probe-stack");
+}
+
 /// This discards all of the MachineBasicBlock numbers and recomputes them.
 /// This guarantees that the MBB numbers are sequential, dense, and match the
 /// ordering of the blocks within the function. If a specific MachineBasicBlock
Index: lib/Target/X86/X86FrameLowering.h
===================================================================
--- lib/Target/X86/X86FrameLowering.h
+++ lib/Target/X86/X86FrameLowering.h
@@ -48,6 +48,26 @@
 
   unsigned StackPtr;
 
+  /// Emits code to push a caller-save register before a call to the stack
+  /// probing function.
+  void pushRegForStackProbeCall(MachineFunction &MF,
+                                MachineBasicBlock &MBB,
+                                MachineBasicBlock::iterator MBBI,
+                                DebugLoc DL,
+                                bool &IsAlive,
+                                unsigned RegType,
+                                uint64_t &NumBytes) const;
+
+  /// Emits code to pop a caller-save register after a call to the stack
+  /// probing function.
+  void popRegForStackProbeCall(MachineFunction &MF,
+                               MachineBasicBlock &MBB,
+                               MachineBasicBlock::iterator MBBI,
+                               DebugLoc DL,
+                               bool &IsAlive,
+                               unsigned RegType,
+                               uint64_t &NumBytes) const;
+
   /// Emit target stack probe code. This is required for all
   /// large stack allocations on Windows. The caller is required to materialize
   /// the number of bytes to probe in RAX/EAX.
@@ -161,9 +181,9 @@
   /// that uses sub and mov instructions to put the argument onto the stack
   /// into a series of pushes.
   /// Returns true if the transformation succeeded, false if not.
-  bool convertArgMovsToPushes(MachineFunction &MF, 
+  bool convertArgMovsToPushes(MachineFunction &MF,
                               MachineBasicBlock &MBB,
-                              MachineBasicBlock::iterator I, 
+                              MachineBasicBlock::iterator I,
                               uint64_t Amount) const;
 
   /// Wraps up getting a CFI index and building a MachineInstr for it.
Index: lib/Target/X86/X86FrameLowering.cpp
===================================================================
--- lib/Target/X86/X86FrameLowering.cpp
+++ lib/Target/X86/X86FrameLowering.cpp
@@ -68,7 +68,7 @@
 // needsFrameIndexResolution - Do we need to perform FI resolution for
 // this function. Normally, this is required only when the function
 // has any stack objects. However, FI resolution actually has another job,
-// not apparent from the title - it resolves callframesetup/destroy 
+// not apparent from the title - it resolves callframesetup/destroy
 // that were not simplified earlier.
 // So, this is required for x86 functions that have push sequences even
 // when there are no stack objects.
@@ -194,12 +194,13 @@
   return 0;
 }
 
-static bool isEAXLiveIn(MachineBasicBlock &MBB) {
+static bool isLiveIn(MachineBasicBlock &MBB, unsigned CheckReg) {
+  CheckReg = getX86SubSuperRegister(CheckReg, MVT::i32);
+
   for (MachineBasicBlock::RegisterMaskPair RegMask : MBB.liveins()) {
     unsigned Reg = RegMask.PhysReg;
 
-    if (Reg == X86::RAX || Reg == X86::EAX || Reg == X86::AX ||
-        Reg == X86::AH || Reg == X86::AL)
+    if (getX86SubSuperRegisterOrZero(Reg, MVT::i32) == CheckReg)
       return true;
   }
 
@@ -264,7 +265,7 @@
   unsigned Reg = 0;
   unsigned Rax = (unsigned)(Is64Bit ? X86::RAX : X86::EAX);
 
-  if (isSub && !isEAXLiveIn(MBB))
+  if (isSub && !isLiveIn(MBB, X86::EAX))
     Reg = Rax;
   else
     Reg = findDeadCallerSavedReg(MBB, MBBI, TRI, Is64Bit);
@@ -478,6 +479,55 @@
   }
 }
 
+void X86FrameLowering::pushRegForStackProbeCall(MachineFunction &MF,
+                                                MachineBasicBlock &MBB,
+                                                MachineBasicBlock::iterator MBBI,
+                                                DebugLoc DL,
+                                                bool &IsAlive,
+                                                unsigned RegType,
+                                                uint64_t &NumBytes) const {
+  IsAlive = MBB.isLiveIn(RegType);
+
+  if (!IsAlive) {
+    return;
+  }
+
+  auto Reg = getX86SubSuperRegister(RegType, Is64Bit ? MVT::i64 : MVT::i32);
+
+  // Save the register on the stack.
+  BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::PUSH64r : X86::PUSH32r))
+      .addReg(Reg, RegState::Kill)
+      .setMIFlag(MachineInstr::FrameSetup);
+
+  // Reuse the space from the spill as a stack allocation.
+  NumBytes -= SlotSize;
+}
+
+void X86FrameLowering::popRegForStackProbeCall(MachineFunction &MF,
+                                               MachineBasicBlock &MBB,
+                                               MachineBasicBlock::iterator MBBI,
+                                               DebugLoc DL,
+                                               bool &IsAlive,
+                                               unsigned RegType,
+                                               uint64_t &NumBytes) const {
+  if (!IsAlive) {
+    return;
+  }
+
+  // Restore the register from the stack slot.
+
+  auto Reg = getX86SubSuperRegister(RegType, Is64Bit ? MVT::i64 : MVT::i32);
+
+  auto MIB = BuildMI(MF, DL,
+                     TII.get(Is64Bit ? X86::MOV64rm : X86::MOV32rm),
+                     Reg);
+  MachineInstr *MI = addRegOffset(MIB, StackPtr, false, NumBytes);
+  MI->setFlag(MachineInstr::FrameSetup);
+  MBB.insert(MBBI, MI);
+
+  NumBytes += SlotSize;
+}
+
 void X86FrameLowering::emitStackProbe(MachineFunction &MF,
                                       MachineBasicBlock &MBB,
                                       MachineBasicBlock::iterator MBBI,
@@ -604,7 +654,7 @@
   int64_t RCXShadowSlot = 0;
   int64_t RDXShadowSlot = 0;
 
-  // If inlining in the prolog, save RCX and RDX.     
+  // If inlining in the prolog, save RCX and RDX.
   // Future optimization: don't save or restore if not live in.
   if (InProlog) {
     // Compute the offsets. We need to account for things already
@@ -748,8 +798,10 @@
   else
     CallOp = X86::CALLpcrel32;
 
-  const char *Symbol;
-  if (Is64Bit) {
+  std::string Symbol;
+  if (MF.getFunction()->hasFnAttribute("probe-stack")) {
+    Symbol = MF.getFunction()->getFnAttribute("probe-stack").getValueAsString().str();
+  } else if (Is64Bit) {
     if (STI.isTargetCygMing()) {
       Symbol = "___chkstk_ms";
     } else {
@@ -769,10 +821,10 @@
     // For the large code model, we have to call through a register. Use R11,
     // as it is scratch in all supported calling conventions.
     BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64ri), X86::R11)
-        .addExternalSymbol(Symbol);
+        .addExternalSymbol(Symbol.c_str());
     CI = BuildMI(MBB, MBBI, DL, TII.get(CallOp)).addReg(X86::R11);
   } else {
-    CI = BuildMI(MBB, MBBI, DL, TII.get(CallOp)).addExternalSymbol(Symbol);
+    CI = BuildMI(MBB, MBBI, DL, TII.get(CallOp)).addExternalSymbol(Symbol.c_str());
   }
 
   unsigned AX = Is64Bit ? X86::RAX : X86::EAX;
@@ -783,13 +835,13 @@
       .addReg(SP, RegState::Define | RegState::Implicit)
       .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
 
-  if (Is64Bit) {
+  if (!STI.isTargetWin32()) {
     // MSVC x64's __chkstk and cygwin/mingw's ___chkstk_ms do not adjust %rsp
     // themselves. It also does not clobber %rax so we can reuse it when
     // adjusting %rsp.
-    BuildMI(MBB, MBBI, DL, TII.get(X86::SUB64rr), X86::RSP)
-        .addReg(X86::RSP)
-        .addReg(X86::RAX);
+    BuildMI(MBB, MBBI, DL, TII.get(getSUBrrOpcode(Is64Bit)), SP)
+        .addReg(SP)
+        .addReg(AX);
   }
 
   if (InProlog) {
@@ -964,7 +1016,7 @@
           ? getX86SubSuperRegister(FramePtr, 64) : FramePtr;
   unsigned BasePtr = TRI->getBaseRegister();
   bool HasWinCFI = false;
-  
+
   // Debug location must be unknown since the first debug location is used
   // to determine the end of the prologue.
   DebugLoc DL;
@@ -978,7 +1030,9 @@
     X86FI->setCalleeSavedFrameSize(
       X86FI->getCalleeSavedFrameSize() - TailCallReturnAddrDelta);
 
-  bool UseStackProbe = (STI.isOSWindows() && !STI.isTargetMachO());
+  bool UseRedZone = false;
+  bool UseStackProbe =
+      (STI.isOSWindows() && !STI.isTargetMachO()) || MF.shouldProbeStack();
 
   // The default stack probe size is 4096 if the function has no stackprobesize
   // attribute.
@@ -1008,6 +1062,10 @@
       !MFI.hasVarSizedObjects() &&             // No dynamic alloca.
       !MFI.adjustsStack() &&                   // No calls.
       !IsWin64CC &&                            // Win64 has no Red Zone
+      !(UseStackProbe && StackSize > 128) &&   // Only use the Red Zone if we can
+                                               // fit the whole stack in it
+                                               // and thus stack probes won't be
+                                               // needed
       !MFI.hasCopyImplyingStackAdjustment() && // Don't push and pop.
       !MF.shouldSplitStack()) {                // Regular stack
     uint64_t MinSize = X86FI->getCalleeSavedFrameSize();
@@ -1015,6 +1073,7 @@
     X86FI->setUsesRedZone(MinSize > 0 || StackSize > 0);
     StackSize = std::max(MinSize, StackSize > 128 ? StackSize - 128 : 0);
     MFI.setStackSize(StackSize);
+    UseRedZone = true;
   }
 
   // Insert stack pointer adjustment for later moving of return addr. Only
@@ -1192,18 +1251,26 @@
   if (IsWin64Prologue && !IsFunclet && TRI->needsStackRealignment(MF))
     AlignedNumBytes = alignTo(AlignedNumBytes, MaxAlign);
   if (AlignedNumBytes >= StackProbeSize && UseStackProbe) {
-    // Check whether EAX is livein for this block.
-    bool isEAXAlive = isEAXLiveIn(MBB);
+    assert(!UseRedZone && "The Red Zone is not accounted for in stack probes");
 
-    if (isEAXAlive) {
-      // Sanity check that EAX is not livein for this function.
-      // It should not be, so throw an assert.
-      assert(!Is64Bit && "EAX is livein in x64 case!");
+    // In the large code model, we have to load the address of the stack probe
+    // function into a scratch register to call it; R11 is used for that.
+    bool SpillR11 = Is64Bit && MF.getTarget().getCodeModel() == CodeModel::Large;
 
-      // Save EAX
-      BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH32r))
-        .addReg(X86::EAX, RegState::Kill)
-        .setMIFlag(MachineInstr::FrameSetup);
+    // Spill the registers we need to preserve across the stack probe call.
+
+    bool RAXAlive, RBXAlive, R11Alive;
+
+    pushRegForStackProbeCall(MF, MBB, MBBI, DL, RAXAlive, X86::RAX, NumBytes);
+    pushRegForStackProbeCall(MF, MBB, MBBI, DL, RBXAlive, X86::RBX, NumBytes);
+    if (SpillR11) {
+      pushRegForStackProbeCall(MF, MBB, MBBI, DL, R11Alive, X86::R11, NumBytes);
+    }
+
+    if (!STI.isOSWindows()) {
+      BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EDX)
+        .addImm(0x1000)
+        .setMIFlag(MachineInstr::FrameSetup);
     }
 
     if (Is64Bit) {
@@ -1223,24 +1290,20 @@
         .setMIFlag(MachineInstr::FrameSetup);
       }
     } else {
-      // Allocate NumBytes-4 bytes on stack in case of isEAXAlive.
-      // We'll also use 4 already allocated bytes for EAX.
       BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX)
-        .addImm(isEAXAlive ? NumBytes - 4 : NumBytes)
+        .addImm(NumBytes)
         .setMIFlag(MachineInstr::FrameSetup);
     }
 
     // Call __chkstk, __chkstk_ms, or __alloca.
     emitStackProbe(MF, MBB, MBBI, DL, true);
 
-    if (isEAXAlive) {
-      // Restore EAX
-      MachineInstr *MI =
-          addRegOffset(BuildMI(MF, DL, TII.get(X86::MOV32rm), X86::EAX),
-                       StackPtr, false, NumBytes - 4);
-      MI->setFlag(MachineInstr::FrameSetup);
-      MBB.insert(MBBI, MI);
+    // Now restore the registers spilled before the stack probe call.
+    if (SpillR11) {
+      popRegForStackProbeCall(MF, MBB, MBBI, DL, R11Alive, X86::R11, NumBytes);
     }
+    popRegForStackProbeCall(MF, MBB, MBBI, DL, RBXAlive, X86::RBX, NumBytes);
+    popRegForStackProbeCall(MF, MBB, MBBI, DL, RAXAlive, X86::RAX, NumBytes);
   } else if (NumBytes) {
     emitSPUpdate(MBB, MBBI, -(int64_t)NumBytes, /*InEpilogue=*/false);
   }
@@ -2607,7 +2670,7 @@
     Regs[FoundRegs++] = Regs[0];
 
   for (int i = 0; i < NumPops; ++i)
-    BuildMI(MBB, MBBI, DL, 
+    BuildMI(MBB, MBBI, DL,
             TII.get(STI.is64Bit() ? X86::POP64r : X86::POP32r), Regs[i]);
 
   return true;
@@ -2639,7 +2702,7 @@
   MachineModuleInfo &MMI = MF.getMMI();
   const Function *Fn = MF.getFunction();
   bool WindowsCFI = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
-  bool DwarfCFI = !WindowsCFI && 
+  bool DwarfCFI = !WindowsCFI &&
       (MMI.hasDebugInfo() || Fn->needsUnwindTableEntry());
 
   // If we have any exception handlers in this function, and we adjust
@@ -2888,7 +2951,7 @@
     // in general. Something to keep in mind, though.
     if (DensityAScaled == DensityBScaled)
      return A.ObjectAlignment < B.ObjectAlignment;
-    
+
     return DensityAScaled < DensityBScaled;
   }
 };
@@ -2924,7 +2987,7 @@
     if (ObjectSize == 0)
       // Variable size. Just use 4.
      SortingObjects[Obj].ObjectSize = 4;
-    else 
+    else
      SortingObjects[Obj].ObjectSize = ObjectSize;
   }
 
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -18649,7 +18649,7 @@
   MachineFunction &MF = DAG.getMachineFunction();
   bool SplitStack = MF.shouldSplitStack();
   bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
-               SplitStack;
+               SplitStack || MF.shouldProbeStack();
   SDLoc dl(Op);
 
   // Get the inputs.
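For reference, a minimal sketch (not part of the patch) of how a frontend could opt a
function into stack probing once this lands. It assumes only what the patch reads:
MachineFunction::shouldProbeStack() checks for the presence of a "probe-stack" string
attribute, and emitStackProbeCall() calls the symbol named by the attribute's value
instead of the default __chkstk/___chkstk_ms. The probe routine name "__probestack"
below is purely illustrative and is not defined by this patch.

  #include "llvm/IR/Function.h"

  // Mark F so that X86 frame lowering emits stack probes for it. The attribute
  // value names the probing routine that emitStackProbeCall() will call.
  static void requestStackProbes(llvm::Function &F) {
    F.addFnAttr("probe-stack", "__probestack"); // illustrative symbol name
  }

In textual IR this corresponds to a string function attribute, e.g.
attributes #0 = { "probe-stack"="__probestack" }.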