Index: include/llvm/Target/TargetFrameLowering.h
===================================================================
--- include/llvm/Target/TargetFrameLowering.h
+++ include/llvm/Target/TargetFrameLowering.h
@@ -158,6 +158,10 @@
   virtual void emitEpilogue(MachineFunction &MF,
                             MachineBasicBlock &MBB) const = 0;
 
+  /// Replace a StackProbe stub (if any) with the actual probe code inline
+  virtual void inlineStackProbe(MachineFunction &MF,
+                                MachineBasicBlock &PrologueMBB) const {}
+
   /// Adjust the prologue to have the function use segmented stacks. This works
   /// by adding a check even before the "normal" function prologue.
   virtual void adjustForSegmentedStacks(MachineFunction &MF,
Index: lib/CodeGen/PrologEpilogInserter.cpp
===================================================================
--- lib/CodeGen/PrologEpilogInserter.cpp
+++ lib/CodeGen/PrologEpilogInserter.cpp
@@ -781,6 +781,9 @@
   for (MachineBasicBlock *RestoreBlock : RestoreBlocks)
     TFI.emitEpilogue(Fn, *RestoreBlock);
 
+  for (MachineBasicBlock *SaveBlock : SaveBlocks)
+    TFI.inlineStackProbe(Fn, *SaveBlock);
+
   // Emit additional code that is required to support segmented stacks, if
   // we've been asked for it. This, when linked with a runtime with support
   // for segmented stacks (libgcc is one), will result in allocating stack
Index: lib/Target/X86/X86FrameLowering.h
===================================================================
--- lib/Target/X86/X86FrameLowering.h
+++ lib/Target/X86/X86FrameLowering.h
@@ -47,11 +47,17 @@
 
   unsigned StackPtr;
 
-  /// Emit a call to the target's stack probe function. This is required for all
+  /// Emit target stack probe code. This is required for all
   /// large stack allocations on Windows. The caller is required to materialize
-  /// the number of bytes to probe in RAX/EAX.
-  void emitStackProbeCall(MachineFunction &MF, MachineBasicBlock &MBB,
-                          MachineBasicBlock::iterator MBBI, DebugLoc DL) const;
+  /// the number of bytes to probe in RAX/EAX. Returns instruction just
+  /// after the expansion.
+  MachineInstr *emitStackProbe(MachineFunction &MF, MachineBasicBlock &MBB,
+                               MachineBasicBlock::iterator MBBI, DebugLoc DL,
+                               bool InProlog) const;
+
+  /// Replace a StackProbe inline-stub with the actual probe code inline.
+  void inlineStackProbe(MachineFunction &MF,
+                        MachineBasicBlock &PrologMBB) const override;
 
   void emitCalleeSavedFrameMoves(MachineBasicBlock &MBB,
                                  MachineBasicBlock::iterator MBBI,
@@ -125,6 +131,15 @@
   /// \p MBB will be correctly handled by the target.
   bool canUseAsEpilogue(const MachineBasicBlock &MBB) const override;
 
+  /// convertArgMovsToPushes - This method tries to convert a call sequence
+  /// that uses sub and mov instructions to put the argument onto the stack
+  /// into a series of pushes.
+  /// Returns true if the transformation succeeded, false if not.
+  bool convertArgMovsToPushes(MachineFunction &MF,
+                              MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator I,
+                              uint64_t Amount) const;
+
   /// Wraps up getting a CFI index and building a MachineInstr for it.
   void BuildCFI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
                 DebugLoc DL, MCCFIInstruction CFIInst) const;
@@ -139,6 +154,23 @@
 private:
   uint64_t calculateMaxStackAlign(const MachineFunction &MF) const;
 
+  /// Emit target stack probe as a call to a helper function
+  MachineInstr *emitStackProbeCall(MachineFunction &MF, MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator MBBI,
+                                   DebugLoc DL, bool InProlog) const;
+
+  /// Emit target stack probe as an inline sequence.
+  MachineInstr *emitStackProbeInline(MachineFunction &MF,
+                                     MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator MBBI,
+                                     DebugLoc DL, bool InProlog) const;
+
+  /// Emit a stub to later inline the target stack probe.
+  MachineInstr *emitStackProbeInlineStub(MachineFunction &MF,
+                                         MachineBasicBlock &MBB,
+                                         MachineBasicBlock::iterator MBBI,
+                                         DebugLoc DL, bool InProlog) const;
+
   /// Aligns the stack pointer by ANDing it with -MaxAlign.
   void BuildStackAlignAND(MachineBasicBlock &MBB,
                           MachineBasicBlock::iterator MBBI, DebugLoc DL,
Index: lib/Target/X86/X86FrameLowering.cpp
===================================================================
--- lib/Target/X86/X86FrameLowering.cpp
+++ lib/Target/X86/X86FrameLowering.cpp
@@ -431,10 +431,260 @@
   return false;
 }
 
-void X86FrameLowering::emitStackProbeCall(MachineFunction &MF,
-                                          MachineBasicBlock &MBB,
-                                          MachineBasicBlock::iterator MBBI,
-                                          DebugLoc DL) const {
+MachineInstr *X86FrameLowering::emitStackProbe(MachineFunction &MF,
+                                               MachineBasicBlock &MBB,
+                                               MachineBasicBlock::iterator MBBI,
+                                               DebugLoc DL,
+                                               bool InProlog) const {
+  const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
+  // CoreCLR probes inline. In the prolog only a placeholder call is emitted
+  // here; inlineStackProbe() replaces it with the real sequence afterwards.
+  if (STI.isTargetWindowsCoreCLR()) {
+    if (InProlog) {
+      return emitStackProbeInlineStub(MF, MBB, MBBI, DL, true);
+    } else {
+      return emitStackProbeInline(MF, MBB, MBBI, DL, false);
+    }
+  } else {
+    return emitStackProbeCall(MF, MBB, MBBI, DL, InProlog);
+  }
+}
+
+void X86FrameLowering::inlineStackProbe(MachineFunction &MF,
+                                        MachineBasicBlock &PrologMBB) const {
+  const StringRef ChkStkStubSymbol = "__chkstk_stub";
+  MachineInstr *ChkStkStub = nullptr;
+
+  // Look for the placeholder call to __chkstk_stub in the prolog block.
+  for (MachineInstr &MI : PrologMBB) {
+    if (MI.isCall() && MI.getOperand(0).isSymbol() &&
+        ChkStkStubSymbol == MI.getOperand(0).getSymbolName()) {
+      ChkStkStub = &MI;
+      break;
+    }
+  }
+
+  if (ChkStkStub != nullptr) {
+    MachineBasicBlock::iterator MBBI = std::next(ChkStkStub->getIterator());
+    assert(std::prev(MBBI).operator==(ChkStkStub) &&
+           "MBBI expected after __chkstk_stub.");
+    DebugLoc DL = PrologMBB.findDebugLoc(MBBI);
+    emitStackProbeInline(MF, PrologMBB, MBBI, DL, true);
+    ChkStkStub->eraseFromParent();
+  }
+}
+
+MachineInstr *X86FrameLowering::emitStackProbeInline(
+    MachineFunction &MF, MachineBasicBlock &MBB,
+    MachineBasicBlock::iterator MBBI, DebugLoc DL, bool InProlog) const {
+  const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
+  assert(STI.is64Bit() && "different expansion needed for 32 bit");
+  assert(STI.isTargetWindowsCoreCLR() && "custom expansion expects CoreCLR");
+  const TargetInstrInfo &TII = *STI.getInstrInfo();
+  const BasicBlock *LLVM_BB = MBB.getBasicBlock();
+
+  // RAX contains the number of bytes of desired stack adjustment.
+  // The handling here assumes this value has already been updated so as to
+  // maintain stack alignment.
+  //
+  // We need to exit with RSP modified by this amount and execute suitable
+  // page touches to notify the OS that we're growing the stack responsibly.
+  // All stack probing must be done without modifying RSP.
+  //
+  // MBB:
+  //    SizeReg = RAX;
+  //    ZeroReg = 0
+  //    CopyReg = RSP
+  //    Flags, TestReg = CopyReg - SizeReg
+  //    FinalReg = !Flags.Ovf ? TestReg : ZeroReg
+  //    LimitReg = gs magic thread env access
+  //    if FinalReg >= LimitReg goto ContinueMBB
+  // RoundMBB:
+  //    RoundedReg = page address of FinalReg
+  // LoopMBB:
+  //    JoinReg = PHI(LimitReg,ProbeReg)
+  //    ProbeReg = JoinReg - PageSize
+  //    [ProbeReg] = 0
+  //    if (ProbeReg != RoundedReg) goto LoopMBB
+  // ContinueMBB:
+  //    RSP = RSP - RAX
+  //    [rest of original MBB]
+
+  // Set up the new basic blocks
+  MachineBasicBlock *RoundMBB = MF.CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *LoopMBB = MF.CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *ContinueMBB = MF.CreateMachineBasicBlock(LLVM_BB);
+
+  MachineFunction::iterator MBBIter = std::next(MBB.getIterator());
+  MF.insert(MBBIter, RoundMBB);
+  MF.insert(MBBIter, LoopMBB);
+  MF.insert(MBBIter, ContinueMBB);
+
+  // Split MBB and move the tail portion down to ContinueMBB.
+  MachineBasicBlock::iterator BeforeMBBI = std::prev(MBBI);
+  ContinueMBB->splice(ContinueMBB->begin(), &MBB, MBBI, MBB.end());
+  ContinueMBB->transferSuccessorsAndUpdatePHIs(&MBB);
+
+  // Some useful constants
+  const int64_t ThreadEnvironmentStackLimit = 0x10;
+  const int64_t PageSize = 0x1000;
+  const int64_t PageMask = ~(PageSize - 1);
+
+  // Registers we need. For the normal case we use virtual
+  // registers. For the prolog expansion we use RAX, RCX and RDX.
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  const TargetRegisterClass *RegClass = &X86::GR64RegClass;
+  const unsigned
+      SizeReg = InProlog ? X86::RAX : MRI.createVirtualRegister(RegClass),
+      ZeroReg = InProlog ? X86::RCX : MRI.createVirtualRegister(RegClass),
+      CopyReg = InProlog ? X86::RDX : MRI.createVirtualRegister(RegClass),
+      TestReg = InProlog ? X86::RDX : MRI.createVirtualRegister(RegClass),
+      FinalReg = InProlog ? X86::RDX : MRI.createVirtualRegister(RegClass),
+      RoundedReg = InProlog ? X86::RDX : MRI.createVirtualRegister(RegClass),
+      LimitReg = InProlog ? X86::RCX : MRI.createVirtualRegister(RegClass),
+      JoinReg = InProlog ? X86::RCX : MRI.createVirtualRegister(RegClass),
+      ProbeReg = InProlog ? X86::RCX : MRI.createVirtualRegister(RegClass);
+
+  // SP-relative offsets where we can save RCX and RDX.
+  int64_t RCXShadowSlot = 0;
+  int64_t RDXShadowSlot = 0;
+
+  // If inlining in the prolog, save RCX and RDX.
+  // Future optimization: don't save or restore if not live in.
+  if (InProlog) {
+    // Compute the offsets. We need to account for things already
+    // pushed onto the stack at this point: return address, frame
+    // pointer (if used), and callee saves.
+    X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+    const int64_t CalleeSaveSize = X86FI->getCalleeSavedFrameSize();
+    const bool HasFP = hasFP(MF);
+    RCXShadowSlot = 8 + CalleeSaveSize + (HasFP ? 8 : 0);
+    RDXShadowSlot = RCXShadowSlot + 8;
+    // Emit the saves.
+    addRegOffset(BuildMI(&MBB, DL, TII.get(X86::MOV64mr)), X86::RSP, false,
+                 RCXShadowSlot)
+        .addReg(X86::RCX);
+    addRegOffset(BuildMI(&MBB, DL, TII.get(X86::MOV64mr)), X86::RSP, false,
+                 RDXShadowSlot)
+        .addReg(X86::RDX);
+  } else {
+    // Not in the prolog. Copy RAX to a virtual reg.
+    BuildMI(&MBB, DL, TII.get(X86::MOV64rr), SizeReg).addReg(X86::RAX);
+  }
+
+  // Add code to MBB to check for overflow and set the new target stack pointer
+  // to zero if so.
+  BuildMI(&MBB, DL, TII.get(X86::XOR64rr), ZeroReg)
+      .addReg(ZeroReg, RegState::Undef)
+      .addReg(ZeroReg, RegState::Undef);
+  BuildMI(&MBB, DL, TII.get(X86::MOV64rr), CopyReg).addReg(X86::RSP);
+  BuildMI(&MBB, DL, TII.get(X86::SUB64rr), TestReg)
+      .addReg(CopyReg)
+      .addReg(SizeReg);
+  BuildMI(&MBB, DL, TII.get(X86::CMOVB64rr), FinalReg)
+      .addReg(TestReg)
+      .addReg(ZeroReg);
+
+  // FinalReg now holds final stack pointer value, or zero if
+  // allocation would overflow. Compare against the current stack
+  // limit from the thread environment block. Note this limit is the
+  // lowest touched page on the stack, not the point at which the OS
+  // will cause an overflow exception, so this is just an optimization
+  // to avoid unnecessarily touching pages that are below the current
+  // SP but already committed to the stack by the OS.
+  BuildMI(&MBB, DL, TII.get(X86::MOV64rm), LimitReg)
+      .addReg(0)
+      .addImm(1)
+      .addReg(0)
+      .addImm(ThreadEnvironmentStackLimit)
+      .addReg(X86::GS);
+  BuildMI(&MBB, DL, TII.get(X86::CMP64rr)).addReg(FinalReg).addReg(LimitReg);
+  // Jump if the desired stack pointer is at or above the stack limit.
+  BuildMI(&MBB, DL, TII.get(X86::JAE_1)).addMBB(ContinueMBB);
+
+  // Add code to roundMBB to round the final stack pointer to a page boundary.
+  BuildMI(RoundMBB, DL, TII.get(X86::AND64ri32), RoundedReg)
+      .addReg(FinalReg)
+      .addImm(PageMask);
+  BuildMI(RoundMBB, DL, TII.get(X86::JMP_1)).addMBB(LoopMBB);
+
+  // LimitReg now holds the current stack limit, RoundedReg page-rounded
+  // final RSP value. Add code to loopMBB to decrement LimitReg page-by-page
+  // and probe until we reach RoundedReg.
+  if (!InProlog) {
+    BuildMI(LoopMBB, DL, TII.get(X86::PHI), JoinReg)
+        .addReg(LimitReg)
+        .addMBB(RoundMBB)
+        .addReg(ProbeReg)
+        .addMBB(LoopMBB);
+  }
+
+  addRegOffset(BuildMI(LoopMBB, DL, TII.get(X86::LEA64r), ProbeReg), JoinReg,
+               false, -PageSize);
+
+  // Probe by storing a byte onto the stack.
+  BuildMI(LoopMBB, DL, TII.get(X86::MOV8mi))
+      .addReg(ProbeReg)
+      .addImm(1)
+      .addReg(0)
+      .addImm(0)
+      .addReg(0)
+      .addImm(0);
+  BuildMI(LoopMBB, DL, TII.get(X86::CMP64rr))
+      .addReg(RoundedReg)
+      .addReg(ProbeReg);
+  BuildMI(LoopMBB, DL, TII.get(X86::JNE_1)).addMBB(LoopMBB);
+
+  MachineBasicBlock::iterator ContinueMBBI = ContinueMBB->getFirstNonPHI();
+
+  // If in prolog, restore RDX and RCX.
+  if (InProlog) {
+    addRegOffset(BuildMI(*ContinueMBB, ContinueMBBI, DL, TII.get(X86::MOV64rm),
+                         X86::RCX),
+                 X86::RSP, false, RCXShadowSlot);
+    addRegOffset(BuildMI(*ContinueMBB, ContinueMBBI, DL, TII.get(X86::MOV64rm),
+                         X86::RDX),
+                 X86::RSP, false, RDXShadowSlot);
+  }
+
+  // Now that the probing is done, add code to continueMBB to update
+  // the stack pointer for real.
+  BuildMI(*ContinueMBB, ContinueMBBI, DL, TII.get(X86::SUB64rr), X86::RSP)
+      .addReg(X86::RSP)
+      .addReg(SizeReg);
+
+  // Add the control flow edges we need.
+  MBB.addSuccessor(ContinueMBB);
+  MBB.addSuccessor(RoundMBB);
+  RoundMBB->addSuccessor(LoopMBB);
+  LoopMBB->addSuccessor(ContinueMBB);
+  LoopMBB->addSuccessor(LoopMBB);
+
+  // Mark all the instructions added to the prolog as frame setup.
+  if (InProlog) {
+    for (++BeforeMBBI; BeforeMBBI != MBB.end(); ++BeforeMBBI) {
+      BeforeMBBI->setFlag(MachineInstr::FrameSetup);
+    }
+    for (MachineInstr &MI : *RoundMBB) {
+      MI.setFlag(MachineInstr::FrameSetup);
+    }
+    for (MachineInstr &MI : *LoopMBB) {
+      MI.setFlag(MachineInstr::FrameSetup);
+    }
+    for (MachineBasicBlock::iterator CMBBI = ContinueMBB->begin();
+         CMBBI != ContinueMBBI; ++CMBBI) {
+      CMBBI->setFlag(MachineInstr::FrameSetup);
+    }
+  }
+
+  // Possible TODO: physreg liveness for InProlog case.
+
+  return ContinueMBBI;
+}
+
+MachineInstr *X86FrameLowering::emitStackProbeCall(
+    MachineFunction &MF, MachineBasicBlock &MBB,
+    MachineBasicBlock::iterator MBBI, DebugLoc DL, bool InProlog) const {
   bool IsLargeCodeModel = MF.getTarget().getCodeModel() == CodeModel::Large;
 
   unsigned CallOp;
@@ -456,6 +706,7 @@
     Symbol = "_chkstk";
 
   MachineInstrBuilder CI;
+  MachineBasicBlock::iterator ExpansionMBBI = std::prev(MBBI);
 
   // All current stack probes take AX and SP as input, clobber flags, and
   // preserve all registers. x86_64 probes leave RSP unmodified.
@@ -485,6 +736,27 @@
         .addReg(X86::RSP)
         .addReg(X86::RAX);
   }
+
+  if (InProlog) {
+    // Apply the frame setup flag to all inserted instrs.
+    for (++ExpansionMBBI; ExpansionMBBI != MBBI; ++ExpansionMBBI)
+      ExpansionMBBI->setFlag(MachineInstr::FrameSetup);
+  }
+
+  return MBBI;
+}
+
+MachineInstr *X86FrameLowering::emitStackProbeInlineStub(
+    MachineFunction &MF, MachineBasicBlock &MBB,
+    MachineBasicBlock::iterator MBBI, DebugLoc DL, bool InProlog) const {
+
+  assert(InProlog && "ChkStkStub called outside prolog!");
+
+  // Placeholder only: inlineStackProbe() swaps this call for the real probe.
+  BuildMI(MBB, MBBI, DL, TII.get(X86::CALLpcrel32))
+      .addExternalSymbol("__chkstk_stub");
+
+  return MBBI;
 }
 
 static unsigned calculateSetFPREG(uint64_t SPAdjust) {
@@ -889,26 +1161,18 @@
       // Allocate NumBytes-4 bytes on stack in case of isEAXAlive.
       // We'll also use 4 already allocated bytes for EAX.
       BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX)
-        .addImm(isEAXAlive ? NumBytes - 4 : NumBytes)
-        .setMIFlag(MachineInstr::FrameSetup);
+          .addImm(isEAXAlive ? NumBytes - 4 : NumBytes)
+          .setMIFlag(MachineInstr::FrameSetup);
     }
 
-    // Save a pointer to the MI where we set AX.
-    MachineBasicBlock::iterator SetRAX = MBBI;
-    --SetRAX;
-
     // Call __chkstk, __chkstk_ms, or __alloca.
-    emitStackProbeCall(MF, MBB, MBBI, DL);
-
-    // Apply the frame setup flag to all inserted instrs.
-    for (; SetRAX != MBBI; ++SetRAX)
-      SetRAX->setFlag(MachineInstr::FrameSetup);
+    emitStackProbe(MF, MBB, MBBI, DL, true);
 
     if (isEAXAlive) {
       // Restore EAX
-      MachineInstr *MI = addRegOffset(BuildMI(MF, DL, TII.get(X86::MOV32rm),
-                                              X86::EAX),
-                                      StackPtr, false, NumBytes - 4);
+      MachineInstr *MI =
+          addRegOffset(BuildMI(MF, DL, TII.get(X86::MOV32rm), X86::EAX),
+                       StackPtr, false, NumBytes - 4);
       MI->setFlag(MachineInstr::FrameSetup);
       MBB.insert(MBBI, MI);
     }
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -21380,15 +21380,13 @@
 MachineBasicBlock *
 X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI,
                                         MachineBasicBlock *BB) const {
-  DebugLoc DL = MI->getDebugLoc();
-
   assert(!Subtarget->isTargetMachO());
-
-  Subtarget->getFrameLowering()->emitStackProbeCall(*BB->getParent(), *BB, MI,
-                                                    DL);
-
-  MI->eraseFromParent(); // The pseudo instruction is gone now.
-  return BB;
+  DebugLoc DL = MI->getDebugLoc();
+  MachineInstr *ResumeMI = Subtarget->getFrameLowering()->emitStackProbe(
+      *BB->getParent(), *BB, MI, DL, false);
+  MachineBasicBlock *ResumeBB = ResumeMI->getParent();
+  MI->eraseFromParent(); // The pseudo instruction is gone now.
+  return ResumeBB;
 }
 
 MachineBasicBlock *
Index: test/CodeGen/X86/stack-probe-size.ll
===================================================================
--- test/CodeGen/X86/stack-probe-size.ll
+++ test/CodeGen/X86/stack-probe-size.ll
@@ -7,7 +7,6 @@
 ; this is unlikely to change in the future.
 ;
 ; RUN: llc -mtriple=i686-windows-msvc < %s | FileCheck %s
-; RUN: llc -mtriple=i686-windows-coreclr < %s | FileCheck %s
 
 target datalayout = "e-m:w-p:32:32-i64:64-f80:32-n8:16:32-S32"
 
Index: test/CodeGen/X86/win_coreclr_chkstk.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/win_coreclr_chkstk.ll
@@ -0,0 +1,143 @@
+; RUN: llc < %s -mtriple=x86_64-pc-win32-coreclr | FileCheck %s -check-prefix=WIN_X64
+; RUN: llc < %s -mtriple=x86_64-pc-linux | FileCheck %s -check-prefix=LINUX
+
+; By default, windows CoreCLR requires an inline prologue stack expansion check
+; if more than 4096 bytes are allocated on the stack.
+
+; Prolog stack allocation >= 4096 bytes will require the probe sequence
+define i32 @main4k() nounwind {
+entry:
+; WIN_X64-LABEL:main4k:
+; WIN_X64: # BB#0:
+; WIN_X64: movl $4096, %eax
+; WIN_X64: movq %rcx, 8(%rsp)
+; WIN_X64: movq %rdx, 16(%rsp)
+; WIN_X64: xorq %rcx, %rcx
+; WIN_X64: movq %rsp, %rdx
+; WIN_X64: subq %rax, %rdx
+; WIN_X64: cmovbq %rcx, %rdx
+; WIN_X64: movq %gs:16, %rcx
+; WIN_X64: cmpq %rcx, %rdx
+; WIN_X64: jae .LBB0_3
+; WIN_X64:# BB#1:
+; WIN_X64: andq $-4096, %rdx
+; WIN_X64:.LBB0_2:
+; WIN_X64: leaq -4096(%rcx), %rcx
+; WIN_X64: movb $0, (%rcx)
+; WIN_X64: cmpq %rcx, %rdx
+; WIN_X64: jne .LBB0_2
+; WIN_X64:.LBB0_3:
+; WIN_X64: movq 8(%rsp), %rcx
+; WIN_X64: movq 16(%rsp), %rdx
+; WIN_X64: subq %rax, %rsp
+; WIN_X64: xorl %eax, %eax
+; WIN_X64: addq $4096, %rsp
+; WIN_X64: retq
+; LINUX-LABEL:main4k:
+; LINUX-NOT: movq %gs:16, %rcx
+; LINUX: retq
+  %a = alloca [4096 x i8]
+  ret i32 0
+}
+
+; Prolog stack allocation >= 4096 bytes will require the probe sequence
+; Case with frame pointer
+define i32 @main4k_frame() nounwind "no-frame-pointer-elim"="true" {
+entry:
+; WIN_X64-LABEL:main4k_frame:
+; WIN_X64: movq %rcx, 16(%rsp)
+; WIN_X64: movq %gs:16, %rcx
+; LINUX-LABEL:main4k_frame:
+; LINUX-NOT: movq %gs:16, %rcx
+; LINUX: retq
+  %a = alloca [4096 x i8]
+  ret i32 0
+}
+
+; Prolog stack allocation >= 4096 bytes will require the probe sequence
+; Case with INT args
+define i32 @main4k_intargs(i32 %x, i32 %y) nounwind {
+entry:
+; WIN_X64: movq %rcx, 8(%rsp)
+; WIN_X64: movq %gs:16, %rcx
+; LINUX-NOT: movq %gs:16, %rcx
+; LINUX: retq
+  %a = alloca [4096 x i8]
+  %t = add i32 %x, %y
+  ret i32 %t
+}
+
+; Prolog stack allocation >= 4096 bytes will require the probe sequence
+; Case with FP regs
+define i32 @main4k_fpargs(double %x, double %y) nounwind {
+entry:
+; WIN_X64: movq %rcx, 8(%rsp)
+; WIN_X64: movq %gs:16, %rcx
+; LINUX-NOT: movq %gs:16, %rcx
+; LINUX: retq
+  %a = alloca [4096 x i8]
+  ret i32 0
+}
+
+; Prolog stack allocation >= 4096 bytes will require the probe sequence
+; Case with mixed regs
+define i32 @main4k_mixargs(double %x, i32 %y) nounwind {
+entry:
+; WIN_X64: movq %gs:16, %rcx
+; LINUX-NOT: movq %gs:16, %rcx
+; LINUX: retq
+  %a = alloca [4096 x i8]
+  ret i32 %y
+}
+
+; Make sure we don't emit the probe for a smaller prolog stack allocation.
+define i32 @main128() nounwind {
+entry:
+; WIN_X64-NOT: movq %gs:16, %rcx
+; WIN_X64: retq
+; LINUX-NOT: movq %gs:16, %rcx
+; LINUX: retq
+  %a = alloca [128 x i8]
+  ret i32 0
+}
+
+; Make sure we don't emit the probe sequence if not on windows even if the
+; caller has the Win64 calling convention.
+define x86_64_win64cc i32 @main4k_win64() nounwind {
+entry:
+; WIN_X64: movq %gs:16, %rcx
+; LINUX-NOT: movq %gs:16, %rcx
+; LINUX: retq
+  %a = alloca [4096 x i8]
+  ret i32 0
+}
+
+declare i32 @bar(i8*) nounwind
+
+; Within-body inline probe expansion
+define x86_64_win64cc i32 @main4k_alloca(i64 %n) nounwind {
+entry:
+; WIN_X64: callq bar
+; WIN_X64: movq %gs:16, [[R:%r.*]]
+; WIN_X64: callq bar
+; LINUX: callq bar
+; LINUX-NOT: movq %gs:16, [[R:%r.*]]
+; LINUX: callq bar
+  %a = alloca i8, i64 1024
+  %ra = call i32 @bar(i8* %a) nounwind
+  %b = alloca i8, i64 %n
+  %rb = call i32 @bar(i8* %b) nounwind
+  %r = add i32 %ra, %rb
+  ret i32 %r
+}
+
+; Influence of stack-probe-size attribute
+; Note this is not exposed in coreclr
+define i32 @test_probe_size() "stack-probe-size"="8192" nounwind {
+; WIN_X64-NOT: movq %gs:16, %rcx
+; WIN_X64: retq
+; LINUX-NOT: movq %gs:16, %rcx
+; LINUX: retq
+  %a = alloca [4096 x i8]
+  ret i32 0
+}