Index: llvm/lib/Target/X86/X86FrameLowering.h =================================================================== --- llvm/lib/Target/X86/X86FrameLowering.h +++ llvm/lib/Target/X86/X86FrameLowering.h @@ -213,14 +213,14 @@ void emitStackProbeInlineGenericBlock(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - const DebugLoc &DL, - uint64_t Offset) const; + const DebugLoc &DL, uint64_t Offset, + uint64_t Align) const; void emitStackProbeInlineGenericLoop(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - const DebugLoc &DL, - uint64_t Offset) const; + const DebugLoc &DL, uint64_t Offset, + uint64_t Align) const; /// Emit a stub to later inline the target stack probe. MachineInstr *emitStackProbeInlineStub(MachineFunction &MF, Index: llvm/lib/Target/X86/X86FrameLowering.cpp =================================================================== --- llvm/lib/Target/X86/X86FrameLowering.cpp +++ llvm/lib/Target/X86/X86FrameLowering.cpp @@ -586,29 +586,55 @@ const uint64_t StackProbeSize = TLI.getStackProbeSize(MF); uint64_t ProbeChunk = StackProbeSize * 8; + uint64_t MaxAlign = + TRI->needsStackRealignment(MF) ? calculateMaxStackAlign(MF) : 0; + // Synthesize a loop or unroll it, depending on the number of iterations. + // BuildStackAlignAND ensures that only MaxAlign % StackProbeSize bytes are left + // between the unaligned rsp and current rsp. 
if (Offset > ProbeChunk) { - emitStackProbeInlineGenericLoop(MF, MBB, MBBI, DL, Offset); + emitStackProbeInlineGenericLoop(MF, MBB, MBBI, DL, Offset, + MaxAlign % StackProbeSize); } else { - emitStackProbeInlineGenericBlock(MF, MBB, MBBI, DL, Offset); + emitStackProbeInlineGenericBlock(MF, MBB, MBBI, DL, Offset, + MaxAlign % StackProbeSize); } } void X86FrameLowering::emitStackProbeInlineGenericBlock( MachineFunction &MF, MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, const DebugLoc &DL, - uint64_t Offset) const { + MachineBasicBlock::iterator MBBI, const DebugLoc &DL, uint64_t Offset, + uint64_t AlignOffset) const { const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>(); const X86TargetLowering &TLI = *STI.getTargetLowering(); const unsigned Opc = getSUBriOpcode(Uses64BitFramePtr, Offset); const unsigned MovMIOpc = Is64Bit ? X86::MOV64mi32 : X86::MOV32mi; const uint64_t StackProbeSize = TLI.getStackProbeSize(MF); + uint64_t CurrentOffset = 0; - // 0 Thanks to return address being saved on the stack - uint64_t CurrentProbeOffset = 0; - // For the first N - 1 pages, just probe. I tried to take advantage of + assert(AlignOffset < StackProbeSize); + + // If the offset plus the alignment slack fits within a single page there is + // nothing to do; otherwise emit a first probe that also brings rsp back to a + // page boundary. + if (StackProbeSize < Offset + AlignOffset) { + + MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr) + .addReg(StackPtr) + .addImm(StackProbeSize - AlignOffset) + .setMIFlag(MachineInstr::FrameSetup); + MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead. + + addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(MovMIOpc)) + .setMIFlag(MachineInstr::FrameSetup), + StackPtr, false, 0) + .addImm(0) + .setMIFlag(MachineInstr::FrameSetup); + NumFrameExtraProbe++; + CurrentOffset = StackProbeSize - AlignOffset; + } + + // For the next N - 1 pages, just probe. I tried to take advantage of // natural probes but it implies much more logic and there was very few // interesting natural probes to interleave. 
while (CurrentOffset + StackProbeSize < Offset) { @@ -626,9 +652,9 @@ .setMIFlag(MachineInstr::FrameSetup); NumFrameExtraProbe++; CurrentOffset += StackProbeSize; - CurrentProbeOffset += StackProbeSize; } + // No need to probe the tail, it is smaller than a Page. uint64_t ChunkSize = Offset - CurrentOffset; MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr) .addReg(StackPtr) @@ -639,8 +665,8 @@ void X86FrameLowering::emitStackProbeInlineGenericLoop( MachineFunction &MF, MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, const DebugLoc &DL, - uint64_t Offset) const { + MachineBasicBlock::iterator MBBI, const DebugLoc &DL, uint64_t Offset, + uint64_t AlignOffset) const { assert(Offset && "null offset"); const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>(); @@ -648,6 +674,26 @@ const unsigned MovMIOpc = Is64Bit ? X86::MOV64mi32 : X86::MOV32mi; const uint64_t StackProbeSize = TLI.getStackProbeSize(MF); + if (AlignOffset) { + if (AlignOffset < StackProbeSize) { + // Perform a first smaller allocation followed by a probe. + const unsigned SUBOpc = getSUBriOpcode(Uses64BitFramePtr, AlignOffset); + MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(SUBOpc), StackPtr) + .addReg(StackPtr) + .addImm(AlignOffset) + .setMIFlag(MachineInstr::FrameSetup); + MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead. 
+ + addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(MovMIOpc)) + .setMIFlag(MachineInstr::FrameSetup), + StackPtr, false, 0) + .addImm(0) + .setMIFlag(MachineInstr::FrameSetup); + NumFrameExtraProbe++; + Offset -= AlignOffset; + } + } + // Synthesize a loop NumFrameLoopProbe++; const BasicBlock *LLVM_BB = MBB.getBasicBlock(); @@ -666,8 +712,8 @@ // save loop bound { - const unsigned Opc = getSUBriOpcode(Uses64BitFramePtr, Offset); - BuildMI(MBB, MBBI, DL, TII.get(Opc), FinalStackProbed) + const unsigned SUBOpc = getSUBriOpcode(Uses64BitFramePtr, Offset); + BuildMI(MBB, MBBI, DL, TII.get(SUBOpc), FinalStackProbed) .addReg(FinalStackProbed) .addImm(Offset / StackProbeSize * StackProbeSize) .setMIFlag(MachineInstr::FrameSetup); @@ -675,8 +721,8 @@ // allocate a page { - const unsigned Opc = getSUBriOpcode(Uses64BitFramePtr, StackProbeSize); - BuildMI(testMBB, DL, TII.get(Opc), StackPtr) + const unsigned SUBOpc = getSUBriOpcode(Uses64BitFramePtr, StackProbeSize); + BuildMI(testMBB, DL, TII.get(SUBOpc), StackPtr) .addReg(StackPtr) .addImm(StackProbeSize) .setMIFlag(MachineInstr::FrameSetup); @@ -1052,13 +1098,149 @@ uint64_t MaxAlign) const { uint64_t Val = -MaxAlign; unsigned AndOp = getANDriOpcode(Uses64BitFramePtr, Val); - MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(AndOp), Reg) - .addReg(Reg) - .addImm(Val) - .setMIFlag(MachineInstr::FrameSetup); - // The EFLAGS implicit def is dead. - MI->getOperand(3).setIsDead(); + MachineFunction &MF = *MBB.getParent(); + const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>(); + const X86TargetLowering &TLI = *STI.getTargetLowering(); + const uint64_t StackProbeSize = TLI.getStackProbeSize(MF); + const bool EmitInlineStackProbe = TLI.hasInlineStackProbe(MF); + + // We want to make sure that (in the worst case) at most StackProbeSize bytes + // remain unprobed after the AND. This assumption is used in + // emitStackProbeInlineGeneric. 
+ if (Reg == StackPtr && EmitInlineStackProbe && MaxAlign >= StackProbeSize) { + { + NumFrameLoopProbe++; + MachineBasicBlock *entryMBB = + MF.CreateMachineBasicBlock(MBB.getBasicBlock()); + MachineBasicBlock *headMBB = + MF.CreateMachineBasicBlock(MBB.getBasicBlock()); + MachineBasicBlock *bodyMBB = + MF.CreateMachineBasicBlock(MBB.getBasicBlock()); + MachineBasicBlock *footMBB = + MF.CreateMachineBasicBlock(MBB.getBasicBlock()); + + MachineFunction::iterator MBBIter = MBB.getIterator(); + MF.insert(MBBIter, entryMBB); + MF.insert(MBBIter, headMBB); + MF.insert(MBBIter, bodyMBB); + MF.insert(MBBIter, footMBB); + const unsigned MovMIOpc = Is64Bit ? X86::MOV64mi32 : X86::MOV32mi; + Register FinalStackProbed = Uses64BitFramePtr ? X86::R11 : X86::R11D; + + // Setup entry block + { + + entryMBB->splice(entryMBB->end(), &MBB, MBB.begin(), MBBI); + BuildMI(entryMBB, DL, TII.get(TargetOpcode::COPY), FinalStackProbed) + .addReg(StackPtr) + .setMIFlag(MachineInstr::FrameSetup); + MachineInstr *MI = + BuildMI(entryMBB, DL, TII.get(AndOp), FinalStackProbed) + .addReg(FinalStackProbed) + .addImm(Val) + .setMIFlag(MachineInstr::FrameSetup); + + // The EFLAGS implicit def is dead. + MI->getOperand(3).setIsDead(); + + BuildMI(entryMBB, DL, + TII.get(Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr)) + .addReg(FinalStackProbed) + .addReg(StackPtr) + .setMIFlag(MachineInstr::FrameSetup); + BuildMI(entryMBB, DL, TII.get(X86::JCC_1)) + .addMBB(&MBB) + .addImm(X86::COND_E) + .setMIFlag(MachineInstr::FrameSetup); + entryMBB->addSuccessor(headMBB); + entryMBB->addSuccessor(&MBB); + } + + // Loop entry block + + { + const unsigned SUBOpc = + getSUBriOpcode(Uses64BitFramePtr, StackProbeSize); + BuildMI(headMBB, DL, TII.get(SUBOpc), StackPtr) + .addReg(StackPtr) + .addImm(StackProbeSize) + .setMIFlag(MachineInstr::FrameSetup); + + BuildMI(headMBB, DL, + TII.get(Uses64BitFramePtr ? 
X86::CMP64rr : X86::CMP32rr)) + .addReg(FinalStackProbed) + .addReg(StackPtr) + .setMIFlag(MachineInstr::FrameSetup); + + // jump + BuildMI(headMBB, DL, TII.get(X86::JCC_1)) + .addMBB(footMBB) + .addImm(X86::COND_B) + .setMIFlag(MachineInstr::FrameSetup); + + headMBB->addSuccessor(bodyMBB); + headMBB->addSuccessor(footMBB); + } + + // setup loop body + { + addRegOffset(BuildMI(bodyMBB, DL, TII.get(MovMIOpc)) + .setMIFlag(MachineInstr::FrameSetup), + StackPtr, false, 0) + .addImm(0) + .setMIFlag(MachineInstr::FrameSetup); + + const unsigned SUBOpc = + getSUBriOpcode(Uses64BitFramePtr, StackProbeSize); + BuildMI(bodyMBB, DL, TII.get(SUBOpc), StackPtr) + .addReg(StackPtr) + .addImm(StackProbeSize) + .setMIFlag(MachineInstr::FrameSetup); + + // cmp with stack pointer bound + BuildMI(bodyMBB, DL, + TII.get(Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr)) + .addReg(FinalStackProbed) + .addReg(StackPtr) + .setMIFlag(MachineInstr::FrameSetup); + + // jump + BuildMI(bodyMBB, DL, TII.get(X86::JCC_1)) + .addMBB(bodyMBB) + .addImm(X86::COND_B) + .setMIFlag(MachineInstr::FrameSetup); + bodyMBB->addSuccessor(bodyMBB); + bodyMBB->addSuccessor(footMBB); + } + + // setup loop footer + { + BuildMI(footMBB, DL, TII.get(TargetOpcode::COPY), StackPtr) + .addReg(FinalStackProbed) + .setMIFlag(MachineInstr::FrameSetup); + addRegOffset(BuildMI(footMBB, DL, TII.get(MovMIOpc)) + .setMIFlag(MachineInstr::FrameSetup), + StackPtr, false, 0) + .addImm(0) + .setMIFlag(MachineInstr::FrameSetup); + footMBB->addSuccessor(&MBB); + } + + recomputeLiveIns(*headMBB); + recomputeLiveIns(*bodyMBB); + recomputeLiveIns(*footMBB); + recomputeLiveIns(MBB); + } + } else { + MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(AndOp), Reg) + .addReg(Reg) + .addImm(Val) + .setMIFlag(MachineInstr::FrameSetup); + + // The EFLAGS implicit def is dead. 
+ MI->getOperand(3).setIsDead(); + } } bool X86FrameLowering::has128ByteRedZone(const MachineFunction& MF) const { Index: llvm/test/CodeGen/X86/stack-clash-large-large-align.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/X86/stack-clash-large-large-align.ll @@ -0,0 +1,88 @@ +; RUN: llc < %s | FileCheck %s + + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define i32 @foo_noprotect() local_unnamed_addr { +; CHECK-LABEL: foo_noprotect: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset %rbp, -16 +; CHECK-NEXT: movq %rsp, %rbp +; CHECK-NEXT: .cfi_def_cfa_register %rbp +; CHECK-NEXT: andq $-4096, %rsp # imm = 0xF000 +; CHECK-NEXT: subq $73728, %rsp # imm = 0x12000 +; CHECK-NEXT: movl $1, 392(%rsp) +; CHECK-NEXT: movl $1, 28792(%rsp) +; CHECK-NEXT: movl (%rsp), %eax +; CHECK-NEXT: movq %rbp, %rsp +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: .cfi_def_cfa %rsp, 8 +; CHECK-NEXT: retq + + + %a = alloca i32, i64 18000, align 4096 + %b0 = getelementptr inbounds i32, i32* %a, i64 98 + %b1 = getelementptr inbounds i32, i32* %a, i64 7198 + store volatile i32 1, i32* %b0 + store volatile i32 1, i32* %b1 + %c = load volatile i32, i32* %a + ret i32 %c +} + +define i32 @foo_protect() local_unnamed_addr #0 { +; CHECK-LABEL: foo_protect: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset %rbp, -16 +; CHECK-NEXT: movq %rsp, %rbp +; CHECK-NEXT: .cfi_def_cfa_register %rbp +; CHECK-NEXT: movq %rsp, %r11 +; CHECK-NEXT: andq $-4096, %r11 # imm = 0xF000 +; CHECK-NEXT: cmpq %rsp, %r11 +; CHECK-NEXT: je .LBB1_4 +; CHECK-NEXT:# %bb.1: +; CHECK-NEXT: subq $4096, %rsp # imm = 0x1000 +; CHECK-NEXT: cmpq %rsp, %r11 +; CHECK-NEXT: jb .LBB1_3 +; CHECK-NEXT:.LBB1_2: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: movq $0, (%rsp) +; CHECK-NEXT: subq $4096, %rsp # imm = 
0x1000 +; CHECK-NEXT: cmpq %rsp, %r11 +; CHECK-NEXT: jb .LBB1_2 +; CHECK-NEXT:.LBB1_3: +; CHECK-NEXT: movq %r11, %rsp +; CHECK-NEXT: movq $0, (%rsp) +; CHECK-NEXT:.LBB1_4: +; CHECK-NEXT: movq %rsp, %r11 +; CHECK-NEXT: subq $73728, %r11 # imm = 0x12000 +; CHECK-NEXT:.LBB1_5: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: subq $4096, %rsp # imm = 0x1000 +; CHECK-NEXT: movq $0, (%rsp) +; CHECK-NEXT: cmpq %r11, %rsp +; CHECK-NEXT: jne .LBB1_5 +; CHECK-NEXT:# %bb.6: +; CHECK-NEXT: movl $1, 392(%rsp) +; CHECK-NEXT: movl $1, 28792(%rsp) +; CHECK-NEXT: movl (%rsp), %eax +; CHECK-NEXT: movq %rbp, %rsp +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: .cfi_def_cfa %rsp, 8 +; CHECK-NEXT: retq + + + + + %a = alloca i32, i64 18000, align 4096 + %b0 = getelementptr inbounds i32, i32* %a, i64 98 + %b1 = getelementptr inbounds i32, i32* %a, i64 7198 + store volatile i32 1, i32* %b0 + store volatile i32 1, i32* %b1 + %c = load volatile i32, i32* %a + ret i32 %c +} + +attributes #0 = {"probe-stack"="inline-asm"} Index: llvm/test/CodeGen/X86/stack-clash-no-free-probe.ll =================================================================== --- llvm/test/CodeGen/X86/stack-clash-no-free-probe.ll +++ /dev/null @@ -1,27 +0,0 @@ -; RUN: llc < %s | FileCheck %s - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -define i32 @foo(i64 %i) local_unnamed_addr #0 { -; CHECK-LABEL: foo: -; CHECK: # %bb.0: -; CHECK-NEXT: subq $4096, %rsp # imm = 0x1000 -; CHECK-NEXT: movq $0, (%rsp) -; CHECK-NEXT: subq $3784, %rsp # imm = 0xEC8 -; CHECK-NEXT: .cfi_def_cfa_offset 7888 -; CHECK-NEXT: movl $1, -128(%rsp,%rdi,4) -; CHECK-NEXT: movl -128(%rsp), %eax -; CHECK-NEXT: addq $7880, %rsp # imm = 0x1EC8 -; CHECK-NEXT: .cfi_def_cfa_offset 8 -; CHECK-NEXT: retq - - %a = alloca i32, i32 2000, align 16 - %b = getelementptr inbounds i32, i32* %a, i64 %i - store volatile i32 1, i32* %b - %c = load volatile i32, i32* %a - ret i32 %c -} - -attributes #0 = 
{"probe-stack"="inline-asm"} - Index: llvm/test/CodeGen/X86/stack-clash-small-alloc-medium-align.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/X86/stack-clash-small-alloc-medium-align.ll @@ -0,0 +1,34 @@ +; RUN: llc < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define i32 @foo(i64 %i) local_unnamed_addr #0 { +; CHECK-LABEL: foo: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset %rbp, -16 +; CHECK-NEXT: movq %rsp, %rbp +; CHECK-NEXT: .cfi_def_cfa_register %rbp +; CHECK-NEXT: andq $-64, %rsp +; CHECK-NEXT: subq $4032, %rsp # imm = 0xFC0 +; CHECK-NEXT: movq $0, (%rsp) +; CHECK-NEXT: subq $4032, %rsp # imm = 0xFC0 +; CHECK-NEXT: movl $1, (%rsp,%rdi,4) +; CHECK-NEXT: movl (%rsp), %eax +; CHECK-NEXT: movq %rbp, %rsp +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: .cfi_def_cfa %rsp, 8 +; CHECK-NEXT: retq + + + %a = alloca i32, i32 2000, align 64 + %b = getelementptr inbounds i32, i32* %a, i64 %i + store volatile i32 1, i32* %b + %c = load volatile i32, i32* %a + ret i32 %c +} + +attributes #0 = {"probe-stack"="inline-asm"} + Index: llvm/test/CodeGen/X86/stack-clash-small-large-align.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/X86/stack-clash-small-large-align.ll @@ -0,0 +1,83 @@ +; RUN: llc < %s | FileCheck %s + + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define i32 @foo_noprotect() local_unnamed_addr { +; CHECK-LABEL: foo_noprotect: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset %rbp, -16 +; CHECK-NEXT: movq %rsp, %rbp +; CHECK-NEXT: .cfi_def_cfa_register %rbp +; CHECK-NEXT: andq $-65536, %rsp +; CHECK-NEXT: subq $65536, %rsp +; CHECK-NEXT: movl $1, 392(%rsp) +; CHECK-NEXT: movl (%rsp), 
%eax +; CHECK-NEXT: movq %rbp, %rsp +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: .cfi_def_cfa %rsp, 8 +; CHECK-NEXT: retq + + + + %a = alloca i32, i64 100, align 65536 + %b = getelementptr inbounds i32, i32* %a, i64 98 + store volatile i32 1, i32* %b + %c = load volatile i32, i32* %a + ret i32 %c +} + +define i32 @foo_protect() local_unnamed_addr #0 { +; CHECK-LABEL: foo_protect: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset %rbp, -16 +; CHECK-NEXT: movq %rsp, %rbp +; CHECK-NEXT: .cfi_def_cfa_register %rbp +; CHECK-NEXT: movq %rsp, %r11 +; CHECK-NEXT: andq $-65536, %r11 # imm = 0xFFFF0000 +; CHECK-NEXT: cmpq %rsp, %r11 +; CHECK-NEXT: je .LBB1_4 +; CHECK-NEXT:# %bb.1: +; CHECK-NEXT: subq $4096, %rsp # imm = 0x1000 +; CHECK-NEXT: cmpq %rsp, %r11 +; CHECK-NEXT: jb .LBB1_3 +; CHECK-NEXT:.LBB1_2: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: movq $0, (%rsp) +; CHECK-NEXT: subq $4096, %rsp # imm = 0x1000 +; CHECK-NEXT: cmpq %rsp, %r11 +; CHECK-NEXT: jb .LBB1_2 +; CHECK-NEXT:.LBB1_3: +; CHECK-NEXT: movq %r11, %rsp +; CHECK-NEXT: movq $0, (%rsp) +; CHECK-NEXT:.LBB1_4: +; CHECK-NEXT: movq %rsp, %r11 +; CHECK-NEXT: subq $65536, %r11 # imm = 0x10000 +; CHECK-NEXT:.LBB1_5: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: subq $4096, %rsp # imm = 0x1000 +; CHECK-NEXT: movq $0, (%rsp) +; CHECK-NEXT: cmpq %r11, %rsp +; CHECK-NEXT: jne .LBB1_5 +; CHECK-NEXT:# %bb.6: +; CHECK-NEXT: movl $1, 392(%rsp) +; CHECK-NEXT: movl (%rsp), %eax +; CHECK-NEXT: movq %rbp, %rsp +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: .cfi_def_cfa %rsp, 8 +; CHECK-NEXT: retq + + + + + %a = alloca i32, i64 100, align 65536 + %b = getelementptr inbounds i32, i32* %a, i64 98 + store volatile i32 1, i32* %b + %c = load volatile i32, i32* %a + ret i32 %c +} + +attributes #0 = {"probe-stack"="inline-asm"}