Diff 295829

llvm/lib/Target/X86/X86FrameLowering.h

Show First 20 Lines • Show All 207 Lines • ▼ Show 20 Lines	void emitStackProbeInlineWindowsCoreCLR64(MachineFunction &MF,
bool InProlog) const;		bool InProlog) const;
void emitStackProbeInlineGeneric(MachineFunction &MF, MachineBasicBlock &MBB,		void emitStackProbeInlineGeneric(MachineFunction &MF, MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,		MachineBasicBlock::iterator MBBI,
const DebugLoc &DL, bool InProlog) const;		const DebugLoc &DL, bool InProlog) const;

void emitStackProbeInlineGenericBlock(MachineFunction &MF,		void emitStackProbeInlineGenericBlock(MachineFunction &MF,
MachineBasicBlock &MBB,		MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,		MachineBasicBlock::iterator MBBI,
const DebugLoc &DL,		const DebugLoc &DL, uint64_t Offset,
uint64_t Offset) const;		uint64_t Align) const;

void emitStackProbeInlineGenericLoop(MachineFunction &MF,		void emitStackProbeInlineGenericLoop(MachineFunction &MF,
MachineBasicBlock &MBB,		MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,		MachineBasicBlock::iterator MBBI,
const DebugLoc &DL,		const DebugLoc &DL, uint64_t Offset,
uint64_t Offset) const;		uint64_t Align) const;

/// Emit a stub to later inline the target stack probe.		/// Emit a stub to later inline the target stack probe.
MachineInstr *emitStackProbeInlineStub(MachineFunction &MF,		MachineInstr *emitStackProbeInlineStub(MachineFunction &MF,
MachineBasicBlock &MBB,		MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,		MachineBasicBlock::iterator MBBI,
const DebugLoc &DL,		const DebugLoc &DL,
bool InProlog) const;		bool InProlog) const;

Show All 29 Lines

llvm/lib/Target/X86/X86FrameLowering.cpp

Show First 20 Lines • Show All 580 Lines • ▼ Show 20 Lines	void X86FrameLowering::emitStackProbeInlineGeneric(
const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();		const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
const X86TargetLowering &TLI = *STI.getTargetLowering();		const X86TargetLowering &TLI = *STI.getTargetLowering();
assert(!(STI.is64Bit() && STI.isTargetWindowsCoreCLR()) &&		assert(!(STI.is64Bit() && STI.isTargetWindowsCoreCLR()) &&
"different expansion expected for CoreCLR 64 bit");		"different expansion expected for CoreCLR 64 bit");

const uint64_t StackProbeSize = TLI.getStackProbeSize(MF);		const uint64_t StackProbeSize = TLI.getStackProbeSize(MF);
uint64_t ProbeChunk = StackProbeSize * 8;		uint64_t ProbeChunk = StackProbeSize * 8;

		uint64_t MaxAlign =
		TRI->needsStackRealignment(MF) ? calculateMaxStackAlign(MF) : 0;

// Synthesize a loop or unroll it, depending on the number of iterations.		// Synthesize a loop or unroll it, depending on the number of iterations.
		// BuildStackAlignAND ensures that only MaxAlign % StackProbeSize bytes are
		// left between the unaligned rsp and the current rsp.
if (Offset > ProbeChunk) {		if (Offset > ProbeChunk) {
emitStackProbeInlineGenericLoop(MF, MBB, MBBI, DL, Offset);		emitStackProbeInlineGenericLoop(MF, MBB, MBBI, DL, Offset,
		MaxAlign % StackProbeSize);
} else {		} else {
emitStackProbeInlineGenericBlock(MF, MBB, MBBI, DL, Offset);		emitStackProbeInlineGenericBlock(MF, MBB, MBBI, DL, Offset,
		MaxAlign % StackProbeSize);
}		}
}		}

void X86FrameLowering::emitStackProbeInlineGenericBlock(		void X86FrameLowering::emitStackProbeInlineGenericBlock(
MachineFunction &MF, MachineBasicBlock &MBB,		MachineFunction &MF, MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI, const DebugLoc &DL,		MachineBasicBlock::iterator MBBI, const DebugLoc &DL, uint64_t Offset,
uint64_t Offset) const {		uint64_t AlignOffset) const {

const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();		const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
const X86TargetLowering &TLI = *STI.getTargetLowering();		const X86TargetLowering &TLI = *STI.getTargetLowering();
const unsigned Opc = getSUBriOpcode(Uses64BitFramePtr, Offset);		const unsigned Opc = getSUBriOpcode(Uses64BitFramePtr, Offset);
const unsigned MovMIOpc = Is64Bit ? X86::MOV64mi32 : X86::MOV32mi;		const unsigned MovMIOpc = Is64Bit ? X86::MOV64mi32 : X86::MOV32mi;
const uint64_t StackProbeSize = TLI.getStackProbeSize(MF);		const uint64_t StackProbeSize = TLI.getStackProbeSize(MF);

uint64_t CurrentOffset = 0;		uint64_t CurrentOffset = 0;
// 0 Thanks to return address being saved on the stack
uint64_t CurrentProbeOffset = 0;

// For the first N - 1 pages, just probe. I tried to take advantage of		assert(AlignOffset < StackProbeSize);

		// If the offset is so small it fits within a page, there's nothing to do.
		if (StackProbeSize < Offset + AlignOffset) {

		MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr)
		.addReg(StackPtr)
		.addImm(StackProbeSize - AlignOffset)
		.setMIFlag(MachineInstr::FrameSetup);
		MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead.

		addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(MovMIOpc))
		.setMIFlag(MachineInstr::FrameSetup),
		StackPtr, false, 0)
		.addImm(0)
		.setMIFlag(MachineInstr::FrameSetup);
		NumFrameExtraProbe++;
		CurrentOffset = StackProbeSize - AlignOffset;
		}

		// For the next N - 1 pages, just probe. I tried to take advantage of
// natural probes but it implies much more logic and there was very few		// natural probes but it implies much more logic and there was very few
// interesting natural probes to interleave.		// interesting natural probes to interleave.
while (CurrentOffset + StackProbeSize < Offset) {		while (CurrentOffset + StackProbeSize < Offset) {
MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr)		MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr)
.addReg(StackPtr)		.addReg(StackPtr)
.addImm(StackProbeSize)		.addImm(StackProbeSize)
.setMIFlag(MachineInstr::FrameSetup);		.setMIFlag(MachineInstr::FrameSetup);
MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead.		MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead.


addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(MovMIOpc))		addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(MovMIOpc))
.setMIFlag(MachineInstr::FrameSetup),		.setMIFlag(MachineInstr::FrameSetup),
StackPtr, false, 0)		StackPtr, false, 0)
.addImm(0)		.addImm(0)
.setMIFlag(MachineInstr::FrameSetup);		.setMIFlag(MachineInstr::FrameSetup);
NumFrameExtraProbe++;		NumFrameExtraProbe++;
CurrentOffset += StackProbeSize;		CurrentOffset += StackProbeSize;
CurrentProbeOffset += StackProbeSize;
}		}

		// No need to probe the tail, it is smaller than a Page.
uint64_t ChunkSize = Offset - CurrentOffset;		uint64_t ChunkSize = Offset - CurrentOffset;
MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr)		MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr)
.addReg(StackPtr)		.addReg(StackPtr)
.addImm(ChunkSize)		.addImm(ChunkSize)
.setMIFlag(MachineInstr::FrameSetup);		.setMIFlag(MachineInstr::FrameSetup);
MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead.		MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead.
}		}

void X86FrameLowering::emitStackProbeInlineGenericLoop(		void X86FrameLowering::emitStackProbeInlineGenericLoop(
MachineFunction &MF, MachineBasicBlock &MBB,		MachineFunction &MF, MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI, const DebugLoc &DL,		MachineBasicBlock::iterator MBBI, const DebugLoc &DL, uint64_t Offset,
uint64_t Offset) const {		uint64_t AlignOffset) const {
assert(Offset && "null offset");		assert(Offset && "null offset");

const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();		const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
const X86TargetLowering &TLI = *STI.getTargetLowering();		const X86TargetLowering &TLI = *STI.getTargetLowering();
const unsigned MovMIOpc = Is64Bit ? X86::MOV64mi32 : X86::MOV32mi;		const unsigned MovMIOpc = Is64Bit ? X86::MOV64mi32 : X86::MOV32mi;
const uint64_t StackProbeSize = TLI.getStackProbeSize(MF);		const uint64_t StackProbeSize = TLI.getStackProbeSize(MF);

		if (AlignOffset) {
		lkail (inline comment, unsubmitted, not done): When `Align > StackProbeSize`, which should be rare, is there any chance that the first probe is performed on the old frame?
		serge-sans-paille (author; inline comment, unsubmitted, done): That's a concern I have. In that particular code that's OK, because we check `Align <= StackProbeSize` and we just adjust the first probe. In the other case, what we're basically doing is: `rsp += rem(rsp, align)`; `rsp -= align`; `rsp += StackProbeSize`; `rsp = 0`. If rsp is already aligned, we end up doing `align/StackProbeSize` useless probing.
		if (AlignOffset < StackProbeSize) {
		// Perform a first smaller allocation followed by a probe.
		const unsigned SUBOpc = getSUBriOpcode(Uses64BitFramePtr, AlignOffset);
		MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(SUBOpc), StackPtr)
		.addReg(StackPtr)
		.addImm(AlignOffset)
		.setMIFlag(MachineInstr::FrameSetup);
		MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead.

		addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(MovMIOpc))
		.setMIFlag(MachineInstr::FrameSetup),
		StackPtr, false, 0)
		.addImm(0)
		.setMIFlag(MachineInstr::FrameSetup);
		NumFrameExtraProbe++;
		Offset -= AlignOffset;
		}
		}

// Synthesize a loop		// Synthesize a loop
NumFrameLoopProbe++;		NumFrameLoopProbe++;
const BasicBlock *LLVM_BB = MBB.getBasicBlock();		const BasicBlock *LLVM_BB = MBB.getBasicBlock();

MachineBasicBlock *testMBB = MF.CreateMachineBasicBlock(LLVM_BB);		MachineBasicBlock *testMBB = MF.CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *tailMBB = MF.CreateMachineBasicBlock(LLVM_BB);		MachineBasicBlock *tailMBB = MF.CreateMachineBasicBlock(LLVM_BB);

MachineFunction::iterator MBBIter = ++MBB.getIterator();		MachineFunction::iterator MBBIter = ++MBB.getIterator();
MF.insert(MBBIter, testMBB);		MF.insert(MBBIter, testMBB);
MF.insert(MBBIter, tailMBB);		MF.insert(MBBIter, tailMBB);

Register FinalStackProbed = Uses64BitFramePtr ? X86::R11 : X86::R11D;		Register FinalStackProbed = Uses64BitFramePtr ? X86::R11 : X86::R11D;
BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::COPY), FinalStackProbed)		BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::COPY), FinalStackProbed)
.addReg(StackPtr)		.addReg(StackPtr)
.setMIFlag(MachineInstr::FrameSetup);		.setMIFlag(MachineInstr::FrameSetup);

// save loop bound		// save loop bound
{		{
const unsigned Opc = getSUBriOpcode(Uses64BitFramePtr, Offset);		const unsigned SUBOpc = getSUBriOpcode(Uses64BitFramePtr, Offset);
BuildMI(MBB, MBBI, DL, TII.get(Opc), FinalStackProbed)		BuildMI(MBB, MBBI, DL, TII.get(SUBOpc), FinalStackProbed)
.addReg(FinalStackProbed)		.addReg(FinalStackProbed)
.addImm(Offset / StackProbeSize * StackProbeSize)		.addImm(Offset / StackProbeSize * StackProbeSize)
.setMIFlag(MachineInstr::FrameSetup);		.setMIFlag(MachineInstr::FrameSetup);
}		}

// allocate a page		// allocate a page
{		{
const unsigned Opc = getSUBriOpcode(Uses64BitFramePtr, StackProbeSize);		const unsigned SUBOpc = getSUBriOpcode(Uses64BitFramePtr, StackProbeSize);
BuildMI(testMBB, DL, TII.get(Opc), StackPtr)		BuildMI(testMBB, DL, TII.get(SUBOpc), StackPtr)
.addReg(StackPtr)		.addReg(StackPtr)
.addImm(StackProbeSize)		.addImm(StackProbeSize)
.setMIFlag(MachineInstr::FrameSetup);		.setMIFlag(MachineInstr::FrameSetup);
}		}

// touch the page		// touch the page
addRegOffset(BuildMI(testMBB, DL, TII.get(MovMIOpc))		addRegOffset(BuildMI(testMBB, DL, TII.get(MovMIOpc))
.setMIFlag(MachineInstr::FrameSetup),		.setMIFlag(MachineInstr::FrameSetup),
▲ Show 20 Lines • Show All 359 Lines • ▼ Show 20 Lines
}		}

void X86FrameLowering::BuildStackAlignAND(MachineBasicBlock &MBB,		void X86FrameLowering::BuildStackAlignAND(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,		MachineBasicBlock::iterator MBBI,
const DebugLoc &DL, unsigned Reg,		const DebugLoc &DL, unsigned Reg,
uint64_t MaxAlign) const {		uint64_t MaxAlign) const {
uint64_t Val = -MaxAlign;		uint64_t Val = -MaxAlign;
unsigned AndOp = getANDriOpcode(Uses64BitFramePtr, Val);		unsigned AndOp = getANDriOpcode(Uses64BitFramePtr, Val);

		MachineFunction &MF = *MBB.getParent();
		const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
		const X86TargetLowering &TLI = *STI.getTargetLowering();
		const uint64_t StackProbeSize = TLI.getStackProbeSize(MF);
		const bool EmitInlineStackProbe = TLI.hasInlineStackProbe(MF);

		// We want to make sure that (in worst case) less than StackProbeSize bytes
		efriedma (inline comment, unsubmitted, not done): I don't think this condition is right. Say MaxAlign == StackProbeSize. Then an "and" can allocate up to StackProbeSize-4 bytes. So any subsequent stack allocation can jump over a guard page. (This is an extreme example. Really, it doesn't matter what the alignment is; it's just harder to cause a practical issue if the alignment is small.) In general, we can't skip a probe for a stack allocation. We can only merge the probes for adjacent stack allocations. Say, for example, we realign the stack then allocate "Offset" bytes of aligned memory. We can get away with considering both allocations as a single "allocation" if `MaxAlign+Offset <= StackProbeSize`. But that method of proof works if you analyze them together. If you analyze each allocation independently, you can't prove the safety, so the realignment needs its own probe.
		lkail (inline comment, unsubmitted, not done): Good catch. It looks like PPC64's implementation also has the same issue. I'll post a patch to fix this issue for PPC64.
		// are not probed after the AND. This assumption is used in
		// emitStackProbeInlineGeneric.
		if (Reg == StackPtr && EmitInlineStackProbe && MaxAlign >= StackProbeSize) {
		{
		NumFrameLoopProbe++;
		MachineBasicBlock *entryMBB =
		MF.CreateMachineBasicBlock(MBB.getBasicBlock());
		MachineBasicBlock *headMBB =
		MF.CreateMachineBasicBlock(MBB.getBasicBlock());
		MachineBasicBlock *bodyMBB =
		MF.CreateMachineBasicBlock(MBB.getBasicBlock());
		MachineBasicBlock *footMBB =
		MF.CreateMachineBasicBlock(MBB.getBasicBlock());

		MachineFunction::iterator MBBIter = MBB.getIterator();
		MF.insert(MBBIter, entryMBB);
		MF.insert(MBBIter, headMBB);
		MF.insert(MBBIter, bodyMBB);
		MF.insert(MBBIter, footMBB);
		const unsigned MovMIOpc = Is64Bit ? X86::MOV64mi32 : X86::MOV32mi;
		Register FinalStackProbed = Uses64BitFramePtr ? X86::R11 : X86::R11D;

		// Setup entry block
		{

		entryMBB->splice(entryMBB->end(), &MBB, MBB.begin(), MBBI);
		BuildMI(entryMBB, DL, TII.get(TargetOpcode::COPY), FinalStackProbed)
		.addReg(StackPtr)
		.setMIFlag(MachineInstr::FrameSetup);
		MachineInstr *MI =
		BuildMI(entryMBB, DL, TII.get(AndOp), FinalStackProbed)
		.addReg(FinalStackProbed)
		.addImm(Val)
		.setMIFlag(MachineInstr::FrameSetup);

		// The EFLAGS implicit def is dead.
		MI->getOperand(3).setIsDead();

		BuildMI(entryMBB, DL,
		TII.get(Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr))
		.addReg(FinalStackProbed)
		.addReg(StackPtr)
		.setMIFlag(MachineInstr::FrameSetup);
		BuildMI(entryMBB, DL, TII.get(X86::JCC_1))
		.addMBB(&MBB)
		.addImm(X86::COND_E)
		.setMIFlag(MachineInstr::FrameSetup);
		entryMBB->addSuccessor(headMBB);
		entryMBB->addSuccessor(&MBB);
		}

		// Loop entry block

		{
		const unsigned SUBOpc =
		getSUBriOpcode(Uses64BitFramePtr, StackProbeSize);
		BuildMI(headMBB, DL, TII.get(SUBOpc), StackPtr)
		.addReg(StackPtr)
		.addImm(StackProbeSize)
		.setMIFlag(MachineInstr::FrameSetup);

		BuildMI(headMBB, DL,
		TII.get(Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr))
		.addReg(FinalStackProbed)
		.addReg(StackPtr)
		.setMIFlag(MachineInstr::FrameSetup);

		// jump
		BuildMI(headMBB, DL, TII.get(X86::JCC_1))
		.addMBB(footMBB)
		.addImm(X86::COND_B)
		.setMIFlag(MachineInstr::FrameSetup);

		headMBB->addSuccessor(bodyMBB);
		headMBB->addSuccessor(footMBB);
		}

		// setup loop body
		{
		addRegOffset(BuildMI(bodyMBB, DL, TII.get(MovMIOpc))
		.setMIFlag(MachineInstr::FrameSetup),
		StackPtr, false, 0)
		.addImm(0)
		.setMIFlag(MachineInstr::FrameSetup);

		const unsigned SUBOpc =
		getSUBriOpcode(Uses64BitFramePtr, StackProbeSize);
		BuildMI(bodyMBB, DL, TII.get(SUBOpc), StackPtr)
		.addReg(StackPtr)
		.addImm(StackProbeSize)
		.setMIFlag(MachineInstr::FrameSetup);

		// cmp with stack pointer bound
		BuildMI(bodyMBB, DL,
		TII.get(Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr))
		.addReg(FinalStackProbed)
		.addReg(StackPtr)
		.setMIFlag(MachineInstr::FrameSetup);

		// jump
		BuildMI(bodyMBB, DL, TII.get(X86::JCC_1))
		.addMBB(bodyMBB)
		.addImm(X86::COND_B)
		.setMIFlag(MachineInstr::FrameSetup);
		bodyMBB->addSuccessor(bodyMBB);
		bodyMBB->addSuccessor(footMBB);
		}

		// setup loop footer
		{
		BuildMI(footMBB, DL, TII.get(TargetOpcode::COPY), StackPtr)
		.addReg(FinalStackProbed)
		.setMIFlag(MachineInstr::FrameSetup);
		addRegOffset(BuildMI(footMBB, DL, TII.get(MovMIOpc))
		.setMIFlag(MachineInstr::FrameSetup),
		StackPtr, false, 0)
		.addImm(0)
		.setMIFlag(MachineInstr::FrameSetup);
		footMBB->addSuccessor(&MBB);
		}

		recomputeLiveIns(*headMBB);
		recomputeLiveIns(*bodyMBB);
		recomputeLiveIns(*footMBB);
		recomputeLiveIns(MBB);
		}
		} else {
MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(AndOp), Reg)		MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(AndOp), Reg)
.addReg(Reg)		.addReg(Reg)
.addImm(Val)		.addImm(Val)
.setMIFlag(MachineInstr::FrameSetup);		.setMIFlag(MachineInstr::FrameSetup);

// The EFLAGS implicit def is dead.		// The EFLAGS implicit def is dead.
MI->getOperand(3).setIsDead();		MI->getOperand(3).setIsDead();
}		}
		}

bool X86FrameLowering::has128ByteRedZone(const MachineFunction& MF) const {		bool X86FrameLowering::has128ByteRedZone(const MachineFunction& MF) const {
// x86-64 (non Win64) has a 128 byte red zone which is guaranteed not to be		// x86-64 (non Win64) has a 128 byte red zone which is guaranteed not to be
// clobbered by any interrupt handler.		// clobbered by any interrupt handler.
assert(&STI == &MF.getSubtarget<X86Subtarget>() &&		assert(&STI == &MF.getSubtarget<X86Subtarget>() &&
"MF used frame lowering for wrong subtarget");		"MF used frame lowering for wrong subtarget");
const Function &Fn = MF.getFunction();		const Function &Fn = MF.getFunction();
const bool IsWin64CC = STI.isCallingConvWin64(Fn.getCallingConv());		const bool IsWin64CC = STI.isCallingConvWin64(Fn.getCallingConv());
▲ Show 20 Lines • Show All 2,373 Lines • Show Last 20 Lines

llvm/test/CodeGen/X86/stack-clash-large-large-align.ll

This file was added.

				; RUN: llc < %s \| FileCheck %s


				target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
				target triple = "x86_64-unknown-linux-gnu"

				define i32 @foo_noprotect() local_unnamed_addr {
				; CHECK-LABEL: foo_noprotect:
				; CHECK: # %bb.0:
				; CHECK-NEXT: pushq %rbp
				; CHECK-NEXT: .cfi_def_cfa_offset 16
				; CHECK-NEXT: .cfi_offset %rbp, -16
				; CHECK-NEXT: movq %rsp, %rbp
				; CHECK-NEXT: .cfi_def_cfa_register %rbp
				; CHECK-NEXT: andq $-4096, %rsp # imm = 0xF000
				; CHECK-NEXT: subq $73728, %rsp # imm = 0x12000
				; CHECK-NEXT: movl $1, 392(%rsp)
				; CHECK-NEXT: movl $1, 28792(%rsp)
				; CHECK-NEXT: movl (%rsp), %eax
				; CHECK-NEXT: movq %rbp, %rsp
				; CHECK-NEXT: popq %rbp
				; CHECK-NEXT: .cfi_def_cfa %rsp, 8
				; CHECK-NEXT: retq


				%a = alloca i32, i64 18000, align 4096
				%b0 = getelementptr inbounds i32, i32* %a, i64 98
				%b1 = getelementptr inbounds i32, i32* %a, i64 7198
				store volatile i32 1, i32* %b0
				store volatile i32 1, i32* %b1
				%c = load volatile i32, i32* %a
				ret i32 %c
				}

				define i32 @foo_protect() local_unnamed_addr #0 {
				; CHECK-LABEL: foo_protect:
				; CHECK: # %bb.0:
				; CHECK-NEXT: pushq %rbp
				; CHECK-NEXT: .cfi_def_cfa_offset 16
				; CHECK-NEXT: .cfi_offset %rbp, -16
				; CHECK-NEXT: movq %rsp, %rbp
				; CHECK-NEXT: .cfi_def_cfa_register %rbp
				; CHECK-NEXT: movq %rsp, %r11
				; CHECK-NEXT: andq $-4096, %r11 # imm = 0xF000
				; CHECK-NEXT: cmpq %rsp, %r11
				; CHECK-NEXT: je .LBB1_4
				; CHECK-NEXT:# %bb.1:
				; CHECK-NEXT: subq $4096, %rsp # imm = 0x1000
				; CHECK-NEXT: cmpq %rsp, %r11
				; CHECK-NEXT: jb .LBB1_3
				; CHECK-NEXT:.LBB1_2: # =>This Inner Loop Header: Depth=1
				; CHECK-NEXT: movq $0, (%rsp)
				; CHECK-NEXT: subq $4096, %rsp # imm = 0x1000
				; CHECK-NEXT: cmpq %rsp, %r11
				; CHECK-NEXT: jb .LBB1_2
				; CHECK-NEXT:.LBB1_3:
				; CHECK-NEXT: movq %r11, %rsp
				; CHECK-NEXT: movq $0, (%rsp)
				; CHECK-NEXT:.LBB1_4:
				; CHECK-NEXT: movq %rsp, %r11
				; CHECK-NEXT: subq $73728, %r11 # imm = 0x12000
				; CHECK-NEXT:.LBB1_5: # =>This Inner Loop Header: Depth=1
				; CHECK-NEXT: subq $4096, %rsp # imm = 0x1000
				; CHECK-NEXT: movq $0, (%rsp)
				; CHECK-NEXT: cmpq %r11, %rsp
				; CHECK-NEXT: jne .LBB1_5
				; CHECK-NEXT:# %bb.6:
				; CHECK-NEXT: movl $1, 392(%rsp)
				; CHECK-NEXT: movl $1, 28792(%rsp)
				; CHECK-NEXT: movl (%rsp), %eax
				; CHECK-NEXT: movq %rbp, %rsp
				; CHECK-NEXT: popq %rbp
				; CHECK-NEXT: .cfi_def_cfa %rsp, 8
				; CHECK-NEXT: retq




				%a = alloca i32, i64 18000, align 4096
				%b0 = getelementptr inbounds i32, i32* %a, i64 98
				%b1 = getelementptr inbounds i32, i32* %a, i64 7198
				store volatile i32 1, i32* %b0
				store volatile i32 1, i32* %b1
				%c = load volatile i32, i32* %a
				ret i32 %c
				}

				attributes #0 = {"probe-stack"="inline-asm"}

llvm/test/CodeGen/X86/stack-clash-no-free-probe.ll

This file was deleted.

	; RUN: llc < %s \| FileCheck %s

	target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
	target triple = "x86_64-unknown-linux-gnu"

	define i32 @foo(i64 %i) local_unnamed_addr #0 {
	; CHECK-LABEL: foo:
	; CHECK: # %bb.0:
	; CHECK-NEXT: subq $4096, %rsp # imm = 0x1000
	; CHECK-NEXT: movq $0, (%rsp)
	; CHECK-NEXT: subq $3784, %rsp # imm = 0xEC8
	; CHECK-NEXT: .cfi_def_cfa_offset 7888
	; CHECK-NEXT: movl $1, -128(%rsp,%rdi,4)
	; CHECK-NEXT: movl -128(%rsp), %eax
	; CHECK-NEXT: addq $7880, %rsp # imm = 0x1EC8
	; CHECK-NEXT: .cfi_def_cfa_offset 8
	; CHECK-NEXT: retq

	%a = alloca i32, i32 2000, align 16
	%b = getelementptr inbounds i32, i32* %a, i64 %i
	store volatile i32 1, i32* %b
	%c = load volatile i32, i32* %a
	ret i32 %c
	}

	attributes #0 = {"probe-stack"="inline-asm"}

llvm/test/CodeGen/X86/stack-clash-small-alloc-medium-align.ll

This file was added.

				; RUN: llc < %s \| FileCheck %s

				target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
				target triple = "x86_64-unknown-linux-gnu"

				; \| case1 \| alloca + align < probe_size
				define i32 @foo1(i64 %i) local_unnamed_addr #0 {
				; CHECK-LABEL: foo1:
				; CHECK: # %bb.0:
				; CHECK-NEXT: pushq %rbp
				; CHECK-NEXT: .cfi_def_cfa_offset 16
				; CHECK-NEXT: .cfi_offset %rbp, -16
				; CHECK-NEXT: movq %rsp, %rbp
				; CHECK-NEXT: .cfi_def_cfa_register %rbp
				; CHECK-NEXT: andq $-64, %rsp
				; CHECK-NEXT: subq $832, %rsp # imm = 0x340
				; CHECK-NEXT: movl $1, (%rsp,%rdi,4)
				; CHECK-NEXT: movl (%rsp), %eax
				; CHECK-NEXT: movq %rbp, %rsp
				; CHECK-NEXT: popq %rbp
				; CHECK-NEXT: .cfi_def_cfa %rsp, 8
				; CHECK-NEXT: retq

				%a = alloca i32, i32 200, align 64
				%b = getelementptr inbounds i32, i32* %a, i64 %i
				store volatile i32 1, i32* %b
				%c = load volatile i32, i32* %a
				ret i32 %c
				}

				; \| case2 \| alloca > probe_size, align < probe_size
				define i32 @foo2(i64 %i) local_unnamed_addr #0 {
				; CHECK-LABEL: foo2:
				; CHECK: # %bb.0:
				; CHECK-NEXT: pushq %rbp
				; CHECK-NEXT: .cfi_def_cfa_offset 16
				; CHECK-NEXT: .cfi_offset %rbp, -16
				; CHECK-NEXT: movq %rsp, %rbp
				; CHECK-NEXT: .cfi_def_cfa_register %rbp
				; CHECK-NEXT: andq $-2048, %rsp # imm = 0xF800
				; CHECK-NEXT: subq $2048, %rsp # imm = 0x800
				; CHECK-NEXT: movq $0, (%rsp)
				; CHECK-NEXT: subq $4096, %rsp # imm = 0x1000
				; CHECK-NEXT: movq $0, (%rsp)
				; CHECK-NEXT: subq $2048, %rsp # imm = 0x800
				; CHECK-NEXT: movl $1, (%rsp,%rdi,4)
				; CHECK-NEXT: movl (%rsp), %eax
				; CHECK-NEXT: movq %rbp, %rsp
				; CHECK-NEXT: popq %rbp
				; CHECK-NEXT: .cfi_def_cfa %rsp, 8
				; CHECK-NEXT: retq

				%a = alloca i32, i32 2000, align 2048
				%b = getelementptr inbounds i32, i32* %a, i64 %i
				store volatile i32 1, i32* %b
				%c = load volatile i32, i32* %a
				ret i32 %c
				}

				; \| case3 \| alloca < probe_size, align < probe_size, alloca + align > probe_size
				define i32 @foo3(i64 %i) local_unnamed_addr #0 {
				; CHECK-LABEL: foo3:
				; CHECK: # %bb.0:
				; CHECK-NEXT: pushq %rbp
				; CHECK-NEXT: .cfi_def_cfa_offset 16
				; CHECK-NEXT: .cfi_offset %rbp, -16
				; CHECK-NEXT: movq %rsp, %rbp
				; CHECK-NEXT: .cfi_def_cfa_register %rbp
				; CHECK-NEXT: andq $-1024, %rsp # imm = 0xFC00
				; CHECK-NEXT: subq $3072, %rsp # imm = 0xC00
				; CHECK-NEXT: movq $0, (%rsp)
				; CHECK-NEXT: subq $1024, %rsp # imm = 0x400
				; CHECK-NEXT: movl $1, (%rsp,%rdi,4)
				; CHECK-NEXT: movl (%rsp), %eax
				; CHECK-NEXT: movq %rbp, %rsp
				; CHECK-NEXT: popq %rbp
				; CHECK-NEXT: .cfi_def_cfa %rsp, 8
				; CHECK-NEXT: retq


				%a = alloca i32, i32 1000, align 1024
				%b = getelementptr inbounds i32, i32* %a, i64 %i
				store volatile i32 1, i32* %b
				%c = load volatile i32, i32* %a
				ret i32 %c
				}

				; \| case4 \| alloca + align < probe_size, followed by dynamic alloca
				define i32 @foo4(i64 %i) local_unnamed_addr #0 {
				; CHECK-LABEL: foo4:
				; CHECK: # %bb.0:
				; CHECK-NEXT: pushq %rbp
				; CHECK-NEXT: .cfi_def_cfa_offset 16
				; CHECK-NEXT: .cfi_offset %rbp, -16
				; CHECK-NEXT: movq %rsp, %rbp
				; CHECK-NEXT: .cfi_def_cfa_register %rbp
				; CHECK-NEXT: pushq %rbx
				; CHECK-NEXT: andq $-64, %rsp
				; CHECK-NEXT: subq $896, %rsp # imm = 0x380
				; CHECK-NEXT: movq %rsp, %rbx
				; CHECK-NEXT: .cfi_offset %rbx, -24
				; CHECK-NEXT: movl $1, (%rbx,%rdi,4)
				; CHECK-NEXT: movl (%rbx), %ecx
				; CHECK-NEXT: movq %rsp, %rax
				; CHECK-NEXT: leaq 15(,%rcx,4), %rcx
				; CHECK-NEXT: andq $-16, %rcx
				; CHECK-NEXT: subq %rcx, %rax
				; CHECK-NEXT: cmpq %rsp, %rax
				; CHECK-NEXT: jle .LBB3_3
				; CHECK-NEXT:.LBB3_2: # =>This Inner Loop Header: Depth=1
				; CHECK-NEXT: movq $0, (%rsp)
				; CHECK-NEXT: subq $4096, %rsp # imm = 0x1000
				; CHECK-NEXT: cmpq %rsp, %rax
				; CHECK-NEXT: jg .LBB3_2
				; CHECK-NEXT:.LBB3_3:
				; CHECK-NEXT: andq $-64, %rax
				; CHECK-NEXT: movq %rax, %rsp
				; CHECK-NEXT: movl (%rax), %eax
				; CHECK-NEXT: leaq -8(%rbp), %rsp
				; CHECK-NEXT: popq %rbx
				; CHECK-NEXT: popq %rbp
				; CHECK-NEXT: .cfi_def_cfa %rsp, 8
				; CHECK-NEXT: retq

				%a = alloca i32, i32 200, align 64
				%b = getelementptr inbounds i32, i32* %a, i64 %i
				store volatile i32 1, i32* %b
				%c = load volatile i32, i32* %a
				%d = alloca i32, i32 %c, align 64
				%e = load volatile i32, i32* %d
				ret i32 %e
				}

				attributes #0 = {"probe-stack"="inline-asm"}

llvm/test/CodeGen/X86/stack-clash-small-large-align.ll

This file was added.

				; RUN: llc < %s \| FileCheck %s


				target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
				target triple = "x86_64-unknown-linux-gnu"

				define i32 @foo_noprotect() local_unnamed_addr {
				; CHECK-LABEL: foo_noprotect:
				; CHECK: # %bb.0:
				; CHECK-NEXT: pushq %rbp
				; CHECK-NEXT: .cfi_def_cfa_offset 16
				; CHECK-NEXT: .cfi_offset %rbp, -16
				; CHECK-NEXT: movq %rsp, %rbp
				; CHECK-NEXT: .cfi_def_cfa_register %rbp
				; CHECK-NEXT: andq $-65536, %rsp
				; CHECK-NEXT: subq $65536, %rsp
				; CHECK-NEXT: movl $1, 392(%rsp)
				; CHECK-NEXT: movl (%rsp), %eax
				; CHECK-NEXT: movq %rbp, %rsp
				; CHECK-NEXT: popq %rbp
				; CHECK-NEXT: .cfi_def_cfa %rsp, 8
				; CHECK-NEXT: retq



				%a = alloca i32, i64 100, align 65536
				%b = getelementptr inbounds i32, i32* %a, i64 98
				store volatile i32 1, i32* %b
				%c = load volatile i32, i32* %a
				ret i32 %c
				}

				define i32 @foo_protect() local_unnamed_addr #0 {
				; CHECK-LABEL: foo_protect:
				; CHECK: # %bb.0:
				; CHECK-NEXT: pushq %rbp
				; CHECK-NEXT: .cfi_def_cfa_offset 16
				; CHECK-NEXT: .cfi_offset %rbp, -16
				; CHECK-NEXT: movq %rsp, %rbp
				; CHECK-NEXT: .cfi_def_cfa_register %rbp
				; CHECK-NEXT: movq %rsp, %r11
				; CHECK-NEXT: andq $-65536, %r11 # imm = 0xFFFF0000
				; CHECK-NEXT: cmpq %rsp, %r11
				cuviper (inline comment, unsubmitted, not done): There's an immediate race after `andq $-65536, %rsp` -- if we get a signal here, the stack pointer could be in a potentially bad place and start clobbering stuff. Then adding the full alignment puts us into arbitrary stack memory, or even could go past the top of the stack. If we start writing probes from there, who knows what memory we're clobbering. (This is akin to @lkail's concern.) Consider if the stack is almost aligned to begin with, something like `0x1230010`. The `and` will only adjust a small distance to `0x1230000`, and then the `add` makes it `0x1240000`. The first probe will be a page below that, `0x123E000`, but that's still way out of our frame, not memory we should be writing. Actually, that plays just as badly if the stack is perfectly aligned to begin with. The other extreme was the original concern, perhaps with an incoming stack like `0x123FFE0`. Then it will again `and` to `0x1230000`, `add` to `0x1240000`, and start probing at `0x123E000`, which is OK in this case.
				cuviper (inline comment, unsubmitted, not done): Oops, those start probing at `0x123F000`, sorry for the bad math. The point stands though.
				; CHECK-NEXT: je .LBB1_4
				; CHECK-NEXT:# %bb.1:
				; CHECK-NEXT: subq $4096, %rsp # imm = 0x1000
				; CHECK-NEXT: cmpq %rsp, %r11
				; CHECK-NEXT: jb .LBB1_3
				; CHECK-NEXT:.LBB1_2: # =>This Inner Loop Header: Depth=1
				; CHECK-NEXT: movq $0, (%rsp)
				; CHECK-NEXT: subq $4096, %rsp # imm = 0x1000
				; CHECK-NEXT: cmpq %rsp, %r11
				; CHECK-NEXT: jb .LBB1_2
				; CHECK-NEXT:.LBB1_3:
				; CHECK-NEXT: movq %r11, %rsp
				; CHECK-NEXT: movq $0, (%rsp)
				; CHECK-NEXT:.LBB1_4:
				; CHECK-NEXT: movq %rsp, %r11
				; CHECK-NEXT: subq $65536, %r11 # imm = 0x10000
				; CHECK-NEXT:.LBB1_5: # =>This Inner Loop Header: Depth=1
				; CHECK-NEXT: subq $4096, %rsp # imm = 0x1000
				; CHECK-NEXT: movq $0, (%rsp)
				; CHECK-NEXT: cmpq %r11, %rsp
				; CHECK-NEXT: jne .LBB1_5
				; CHECK-NEXT:# %bb.6:
				; CHECK-NEXT: movl $1, 392(%rsp)
				; CHECK-NEXT: movl (%rsp), %eax
				; CHECK-NEXT: movq %rbp, %rsp
				; CHECK-NEXT: popq %rbp
				; CHECK-NEXT: .cfi_def_cfa %rsp, 8
				; CHECK-NEXT: retq




				%a = alloca i32, i64 100, align 65536
				%b = getelementptr inbounds i32, i32* %a, i64 98
				store volatile i32 1, i32* %b
				%c = load volatile i32, i32* %a
				ret i32 %c
				}

				attributes #0 = {"probe-stack"="inline-asm"}

This is an archive of the discontinued LLVM Phabricator instance.

Fix interaction between stack alignment and inline-asm stack clash protection
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 295829

llvm/lib/Target/X86/X86FrameLowering.h

llvm/lib/Target/X86/X86FrameLowering.cpp

llvm/test/CodeGen/X86/stack-clash-large-large-align.ll

llvm/test/CodeGen/X86/stack-clash-no-free-probe.ll

llvm/test/CodeGen/X86/stack-clash-small-alloc-medium-align.ll

llvm/test/CodeGen/X86/stack-clash-small-large-align.ll

This is an archive of the discontinued LLVM Phabricator instance.

Fix interaction between stack alignment and inline-asm stack clash protectionClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 295829

llvm/lib/Target/X86/X86FrameLowering.h

llvm/lib/Target/X86/X86FrameLowering.cpp

llvm/test/CodeGen/X86/stack-clash-large-large-align.ll

llvm/test/CodeGen/X86/stack-clash-no-free-probe.ll

llvm/test/CodeGen/X86/stack-clash-small-alloc-medium-align.ll

llvm/test/CodeGen/X86/stack-clash-small-large-align.ll

Fix interaction between stack alignment and inline-asm stack clash protection
ClosedPublic