Skip to content

Commit 809cbe9

Browse files
committed Nov 10, 2015
Support for emitting inline stack probes
For CoreCLR on Windows, stack probes must be emitted as inline sequences that probe successive stack pages between the current stack limit and the desired new stack pointer location. This implements support for the inline expansion on x64. For in-body alloca probes, expansion is done during instruction lowering. For prolog probes, a stub call is initially emitted during prolog creation, and expanded after epilog generation, to avoid complications that arise when introducing new machine basic blocks during prolog and epilog creation. Added a new test case, modified an existing one to exclude non-x64 coreclr (for now). Add test case Fix tests llvm-svn: 252578
1 parent 88ae650 commit 809cbe9

File tree

7 files changed

+470
-31
lines changed

7 files changed

+470
-31
lines changed
 

‎llvm/include/llvm/Target/TargetFrameLowering.h

+4
Original file line numberDiff line numberDiff line change
@@ -158,6 +158,10 @@ class TargetFrameLowering {
158158
virtual void emitEpilogue(MachineFunction &MF,
159159
MachineBasicBlock &MBB) const = 0;
160160

161+
/// Replace a StackProbe stub (if any) with the actual probe code inline
162+
virtual void inlineStackProbe(MachineFunction &MF,
163+
MachineBasicBlock &PrologueMBB) const {}
164+
161165
/// Adjust the prologue to have the function use segmented stacks. This works
162166
/// by adding a check even before the "normal" function prologue.
163167
virtual void adjustForSegmentedStacks(MachineFunction &MF,

‎llvm/lib/CodeGen/PrologEpilogInserter.cpp

+3
Original file line numberDiff line numberDiff line change
@@ -781,6 +781,9 @@ void PEI::insertPrologEpilogCode(MachineFunction &Fn) {
781781
for (MachineBasicBlock *RestoreBlock : RestoreBlocks)
782782
TFI.emitEpilogue(Fn, *RestoreBlock);
783783

784+
for (MachineBasicBlock *SaveBlock : SaveBlocks)
785+
TFI.inlineStackProbe(Fn, *SaveBlock);
786+
784787
// Emit additional code that is required to support segmented stacks, if
785788
// we've been asked for it. This, when linked with a runtime with support
786789
// for segmented stacks (libgcc is one), will result in allocating stack

‎llvm/lib/Target/X86/X86FrameLowering.cpp

+278-18
Original file line numberDiff line numberDiff line change
@@ -431,10 +431,257 @@ static bool usesTheStack(const MachineFunction &MF) {
431431
return false;
432432
}
433433

434-
void X86FrameLowering::emitStackProbeCall(MachineFunction &MF,
435-
MachineBasicBlock &MBB,
436-
MachineBasicBlock::iterator MBBI,
437-
DebugLoc DL) const {
434+
MachineInstr *X86FrameLowering::emitStackProbe(MachineFunction &MF,
435+
MachineBasicBlock &MBB,
436+
MachineBasicBlock::iterator MBBI,
437+
DebugLoc DL,
438+
bool InProlog) const {
439+
const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
440+
if (STI.isTargetWindowsCoreCLR()) {
441+
if (InProlog) {
442+
return emitStackProbeInlineStub(MF, MBB, MBBI, DL, true);
443+
} else {
444+
return emitStackProbeInline(MF, MBB, MBBI, DL, false);
445+
}
446+
} else {
447+
return emitStackProbeCall(MF, MBB, MBBI, DL, InProlog);
448+
}
449+
}
450+
451+
void X86FrameLowering::inlineStackProbe(MachineFunction &MF,
                                        MachineBasicBlock &PrologMBB) const {
  // Find the stub call planted by emitStackProbeInlineStub during prolog
  // creation. If present, expand it into the real inline probe sequence
  // and remove the stub; if absent, this prologue needs no probe.
  const StringRef ChkStkStubSymbol = "__chkstk_stub";
  MachineInstr *ChkStkStub = nullptr;

  for (MachineInstr &MI : PrologMBB) {
    if (MI.isCall() && MI.getOperand(0).isSymbol() &&
        ChkStkStubSymbol == MI.getOperand(0).getSymbolName()) {
      ChkStkStub = &MI;
      break;
    }
  }

  if (ChkStkStub == nullptr)
    return;

  // Expand the probe immediately after the stub, then delete the stub.
  MachineBasicBlock::iterator MBBI = std::next(ChkStkStub->getIterator());
  assert(&*std::prev(MBBI) == ChkStkStub &&
         "MBBI expected after __chkstk_stub.");
  DebugLoc DL = PrologMBB.findDebugLoc(MBBI);
  emitStackProbeInline(MF, PrologMBB, MBBI, DL, true);
  ChkStkStub->eraseFromParent();
}
473+
474+
MachineInstr *X86FrameLowering::emitStackProbeInline(
    MachineFunction &MF, MachineBasicBlock &MBB,
    MachineBasicBlock::iterator MBBI, DebugLoc DL, bool InProlog) const {
  const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
  assert(STI.is64Bit() && "different expansion needed for 32 bit");
  assert(STI.isTargetWindowsCoreCLR() && "custom expansion expects CoreCLR");
  const TargetInstrInfo &TII = *STI.getInstrInfo();
  const BasicBlock *LLVM_BB = MBB.getBasicBlock();

  // On entry, RAX holds the requested stack adjustment in bytes; the caller
  // has already rounded it to keep the stack aligned.
  //
  // We must exit with RSP lowered by that amount, touching each intervening
  // stack page on the way so the OS can grow the stack, and we must perform
  // all probing without modifying RSP itself. The expansion is:
  //
  // MBB:
  //   SizeReg = RAX;
  //   ZeroReg = 0
  //   CopyReg = RSP
  //   Flags, TestReg = CopyReg - SizeReg
  //   FinalReg = !Flags.Ovf ? TestReg : ZeroReg
  //   LimitReg = gs magic thread env access
  //   if FinalReg >= LimitReg goto ContinueMBB
  // RoundBB:
  //   RoundReg = page address of FinalReg
  // LoopMBB:
  //   LoopReg = PHI(LimitReg,ProbeReg)
  //   ProbeReg = LoopReg - PageSize
  //   [ProbeReg] = 0
  //   if (ProbeReg > RoundReg) goto LoopMBB
  // ContinueMBB:
  //   RSP = RSP - RAX
  //   [rest of original MBB]

  // Create the three new blocks and insert them right after MBB.
  MachineBasicBlock *RoundMBB = MF.CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *LoopMBB = MF.CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *ContinueMBB = MF.CreateMachineBasicBlock(LLVM_BB);

  MachineFunction::iterator MBBIter = std::next(MBB.getIterator());
  MF.insert(MBBIter, RoundMBB);
  MF.insert(MBBIter, LoopMBB);
  MF.insert(MBBIter, ContinueMBB);

  // Split MBB at MBBI, moving the tail (and MBB's successors) into
  // ContinueMBB. BeforeMBBI remembers where our expansion begins.
  MachineBasicBlock::iterator BeforeMBBI = std::prev(MBBI);
  ContinueMBB->splice(ContinueMBB->begin(), &MBB, MBBI, MBB.end());
  ContinueMBB->transferSuccessorsAndUpdatePHIs(&MBB);

  // Constants for the expansion.
  const int64_t ThreadEnvironmentStackLimit = 0x10;
  const int64_t PageSize = 0x1000;
  const int64_t PageMask = ~(PageSize - 1);

  // Register selection: virtual registers in the in-body case, the fixed
  // physical registers RAX/RCX/RDX in the prolog case (register allocation
  // has already run by then).
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const TargetRegisterClass *RegClass = &X86::GR64RegClass;
  const unsigned
      SizeReg = InProlog ? X86::RAX : MRI.createVirtualRegister(RegClass),
      ZeroReg = InProlog ? X86::RCX : MRI.createVirtualRegister(RegClass),
      CopyReg = InProlog ? X86::RDX : MRI.createVirtualRegister(RegClass),
      TestReg = InProlog ? X86::RDX : MRI.createVirtualRegister(RegClass),
      FinalReg = InProlog ? X86::RDX : MRI.createVirtualRegister(RegClass),
      RoundedReg = InProlog ? X86::RDX : MRI.createVirtualRegister(RegClass),
      LimitReg = InProlog ? X86::RCX : MRI.createVirtualRegister(RegClass),
      JoinReg = InProlog ? X86::RCX : MRI.createVirtualRegister(RegClass),
      ProbeReg = InProlog ? X86::RCX : MRI.createVirtualRegister(RegClass);

  // SP-relative offsets where we can save RCX and RDX.
  int64_t RCXShadowSlot = 0;
  int64_t RDXShadowSlot = 0;

  // In the prolog case, spill RCX and RDX to shadow slots before clobbering
  // them. Future optimization: skip the save/restore if not live in.
  if (InProlog) {
    // The slots sit above whatever is already on the stack at this point:
    // the return address, the frame pointer (if any), and the callee saves.
    X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
    const int64_t CalleeSaveSize = X86FI->getCalleeSavedFrameSize();
    const bool HasFP = hasFP(MF);
    RCXShadowSlot = 8 + CalleeSaveSize + (HasFP ? 8 : 0);
    RDXShadowSlot = RCXShadowSlot + 8;
    // Emit the saves.
    addRegOffset(BuildMI(&MBB, DL, TII.get(X86::MOV64mr)), X86::RSP, false,
                 RCXShadowSlot)
        .addReg(X86::RCX);
    addRegOffset(BuildMI(&MBB, DL, TII.get(X86::MOV64mr)), X86::RSP, false,
                 RDXShadowSlot)
        .addReg(X86::RDX);
  } else {
    // Not in the prolog. Copy RAX to a virtual reg.
    BuildMI(&MBB, DL, TII.get(X86::MOV64rr), SizeReg).addReg(X86::RAX);
  }

  // In MBB: compute the tentative new SP, and substitute zero if the
  // subtraction wrapped (i.e. the requested size exceeds the stack).
  BuildMI(&MBB, DL, TII.get(X86::XOR64rr), ZeroReg)
      .addReg(ZeroReg, RegState::Undef)
      .addReg(ZeroReg, RegState::Undef);
  BuildMI(&MBB, DL, TII.get(X86::MOV64rr), CopyReg).addReg(X86::RSP);
  BuildMI(&MBB, DL, TII.get(X86::SUB64rr), TestReg)
      .addReg(CopyReg)
      .addReg(SizeReg);
  BuildMI(&MBB, DL, TII.get(X86::CMOVB64rr), FinalReg)
      .addReg(TestReg)
      .addReg(ZeroReg);

  // FinalReg now holds the desired final stack pointer (or zero on
  // overflow). Compare it against the current stack limit held in the
  // thread environment block (gs-relative). That limit is the lowest
  // touched page, not the point where the OS raises an overflow exception,
  // so this check is just an optimization that skips probing pages below
  // the current SP that the OS has already committed to the stack.
  BuildMI(&MBB, DL, TII.get(X86::MOV64rm), LimitReg)
      .addReg(0)
      .addImm(1)
      .addReg(0)
      .addImm(ThreadEnvironmentStackLimit)
      .addReg(X86::GS);
  BuildMI(&MBB, DL, TII.get(X86::CMP64rr)).addReg(FinalReg).addReg(LimitReg);
  // Jump if the desired stack pointer is at or above the stack limit.
  BuildMI(&MBB, DL, TII.get(X86::JAE_1)).addMBB(ContinueMBB);

  // In RoundMBB: round the final stack pointer down to a page boundary.
  BuildMI(RoundMBB, DL, TII.get(X86::AND64ri32), RoundedReg)
      .addReg(FinalReg)
      .addImm(PageMask);
  BuildMI(RoundMBB, DL, TII.get(X86::JMP_1)).addMBB(LoopMBB);

  // In LoopMBB: walk down from LimitReg one page at a time, touching each
  // page, until we reach the page-rounded target in RoundedReg. The PHI is
  // only needed (and only legal) in the pre-RA, in-body case.
  if (!InProlog) {
    BuildMI(LoopMBB, DL, TII.get(X86::PHI), JoinReg)
        .addReg(LimitReg)
        .addMBB(RoundMBB)
        .addReg(ProbeReg)
        .addMBB(LoopMBB);
  }

  addRegOffset(BuildMI(LoopMBB, DL, TII.get(X86::LEA64r), ProbeReg), JoinReg,
               false, -PageSize);

  // Probe by storing a byte onto the stack.
  BuildMI(LoopMBB, DL, TII.get(X86::MOV8mi))
      .addReg(ProbeReg)
      .addImm(1)
      .addReg(0)
      .addImm(0)
      .addReg(0)
      .addImm(0);
  BuildMI(LoopMBB, DL, TII.get(X86::CMP64rr))
      .addReg(RoundedReg)
      .addReg(ProbeReg);
  BuildMI(LoopMBB, DL, TII.get(X86::JNE_1)).addMBB(LoopMBB);

  MachineBasicBlock::iterator ContinueMBBI = ContinueMBB->getFirstNonPHI();

  // In the prolog case, reload the RCX/RDX values we spilled above.
  if (InProlog) {
    addRegOffset(BuildMI(*ContinueMBB, ContinueMBBI, DL, TII.get(X86::MOV64rm),
                         X86::RCX),
                 X86::RSP, false, RCXShadowSlot);
    addRegOffset(BuildMI(*ContinueMBB, ContinueMBBI, DL, TII.get(X86::MOV64rm),
                         X86::RDX),
                 X86::RSP, false, RDXShadowSlot);
  }

  // Probing done; now actually adjust the stack pointer.
  BuildMI(*ContinueMBB, ContinueMBBI, DL, TII.get(X86::SUB64rr), X86::RSP)
      .addReg(X86::RSP)
      .addReg(SizeReg);

  // Wire up the CFG edges for the new blocks.
  MBB.addSuccessor(ContinueMBB);
  MBB.addSuccessor(RoundMBB);
  RoundMBB->addSuccessor(LoopMBB);
  LoopMBB->addSuccessor(ContinueMBB);
  LoopMBB->addSuccessor(LoopMBB);

  // Flag everything we inserted as frame setup when expanding in the prolog.
  if (InProlog) {
    for (++BeforeMBBI; BeforeMBBI != MBB.end(); ++BeforeMBBI)
      BeforeMBBI->setFlag(MachineInstr::FrameSetup);
    for (MachineInstr &MI : *RoundMBB)
      MI.setFlag(MachineInstr::FrameSetup);
    for (MachineInstr &MI : *LoopMBB)
      MI.setFlag(MachineInstr::FrameSetup);
    for (MachineBasicBlock::iterator CMBBI = ContinueMBB->begin();
         CMBBI != ContinueMBBI; ++CMBBI)
      CMBBI->setFlag(MachineInstr::FrameSetup);
  }

  // Possible TODO: physreg liveness for InProlog case.

  return ContinueMBBI;
}
681+
682+
MachineInstr *X86FrameLowering::emitStackProbeCall(
683+
MachineFunction &MF, MachineBasicBlock &MBB,
684+
MachineBasicBlock::iterator MBBI, DebugLoc DL, bool InProlog) const {
438685
bool IsLargeCodeModel = MF.getTarget().getCodeModel() == CodeModel::Large;
439686

440687
unsigned CallOp;
@@ -456,6 +703,7 @@ void X86FrameLowering::emitStackProbeCall(MachineFunction &MF,
456703
Symbol = "_chkstk";
457704

458705
MachineInstrBuilder CI;
706+
MachineBasicBlock::iterator ExpansionMBBI = std::prev(MBBI);
459707

460708
// All current stack probes take AX and SP as input, clobber flags, and
461709
// preserve all registers. x86_64 probes leave RSP unmodified.
@@ -485,6 +733,26 @@ void X86FrameLowering::emitStackProbeCall(MachineFunction &MF,
485733
.addReg(X86::RSP)
486734
.addReg(X86::RAX);
487735
}
736+
737+
if (InProlog) {
738+
// Apply the frame setup flag to all inserted instrs.
739+
for (++ExpansionMBBI; ExpansionMBBI != MBBI; ++ExpansionMBBI)
740+
ExpansionMBBI->setFlag(MachineInstr::FrameSetup);
741+
}
742+
743+
return MBBI;
744+
}
745+
746+
MachineInstr *X86FrameLowering::emitStackProbeInlineStub(
    MachineFunction &MF, MachineBasicBlock &MBB,
    MachineBasicBlock::iterator MBBI, DebugLoc DL, bool InProlog) const {
  // Emit a placeholder call to __chkstk_stub. inlineStackProbe() replaces
  // it with the real inline probe sequence after epilog insertion, which
  // avoids creating new basic blocks during prolog emission.
  assert(InProlog && "ChkStkStub called outside prolog!");

  // Note: the MachineInstrBuilder result is intentionally discarded; the
  // stub call needs no further operands.
  BuildMI(MBB, MBBI, DL, TII.get(X86::CALLpcrel32))
      .addExternalSymbol("__chkstk_stub");

  return MBBI;
}
489757

490758
static unsigned calculateSetFPREG(uint64_t SPAdjust) {
@@ -893,26 +1161,18 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
8931161
// Allocate NumBytes-4 bytes on stack in case of isEAXAlive.
8941162
// We'll also use 4 already allocated bytes for EAX.
8951163
BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX)
896-
.addImm(isEAXAlive ? NumBytes - 4 : NumBytes)
897-
.setMIFlag(MachineInstr::FrameSetup);
1164+
.addImm(isEAXAlive ? NumBytes - 4 : NumBytes)
1165+
.setMIFlag(MachineInstr::FrameSetup);
8981166
}
8991167

900-
// Save a pointer to the MI where we set AX.
901-
MachineBasicBlock::iterator SetRAX = MBBI;
902-
--SetRAX;
903-
9041168
// Call __chkstk, __chkstk_ms, or __alloca.
905-
emitStackProbeCall(MF, MBB, MBBI, DL);
906-
907-
// Apply the frame setup flag to all inserted instrs.
908-
for (; SetRAX != MBBI; ++SetRAX)
909-
SetRAX->setFlag(MachineInstr::FrameSetup);
1169+
emitStackProbe(MF, MBB, MBBI, DL, true);
9101170

9111171
if (isEAXAlive) {
9121172
// Restore EAX
913-
MachineInstr *MI = addRegOffset(BuildMI(MF, DL, TII.get(X86::MOV32rm),
914-
X86::EAX),
915-
StackPtr, false, NumBytes - 4);
1173+
MachineInstr *MI =
1174+
addRegOffset(BuildMI(MF, DL, TII.get(X86::MOV32rm), X86::EAX),
1175+
StackPtr, false, NumBytes - 4);
9161176
MI->setFlag(MachineInstr::FrameSetup);
9171177
MBB.insert(MBBI, MI);
9181178
}

0 commit comments

Comments (0)
Please sign in to comment.