Index: llvm/lib/Target/AArch64/AArch64ISelLowering.h =================================================================== --- llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -72,6 +72,10 @@ ADC, SBC, // adc, sbc instructions + // To avoid stack clash, allocation is performed by block and each block is + // probed. + PROBED_ALLOCA, + // Predicated instructions where inactive lanes produce undefined results. ADD_PRED, FADD_PRED, @@ -529,6 +533,8 @@ MachineBasicBlock *EmitLoweredCatchRet(MachineInstr &MI, MachineBasicBlock *BB) const; + MachineBasicBlock *EmitDynamicProbedAlloc(MachineInstr &MI, + MachineBasicBlock *MBB) const; MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, @@ -944,9 +950,8 @@ SDValue LowerATOMIC_LOAD_SUB(SDValue Op, SelectionDAG &DAG) const; SDValue LowerATOMIC_LOAD_AND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerWindowsDYNAMIC_STACKALLOC(SDValue Op, SDValue Chain, - SDValue &Size, - SelectionDAG &DAG) const; + SDValue LowerWindowsDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerInlineDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSVEStructLoad(unsigned Intrinsic, ArrayRef<SDValue> LoadOps, EVT VT, SelectionDAG &DAG, const SDLoc &DL) const; @@ -1050,6 +1055,9 @@ // to transition between unpacked and packed types of the same element type, // with BITCAST used otherwise. SDValue getSVESafeBitCast(EVT VT, SDValue Op, SelectionDAG &DAG) const; + + MachineBasicBlock *emitProbedAlloca(MachineInstr &MI, + MachineBasicBlock *MBB) const; }; namespace AArch64 { Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -467,11 +467,7 @@ // Variable-sized objects.
setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); - - if (Subtarget->isTargetWindows()) - setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom); - else - setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand); + setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom); // Constant pool entries setOperationAction(ISD::ConstantPool, MVT::i64, Custom); @@ -1732,6 +1728,7 @@ MAKE_CASE(AArch64ISD::CSINC) MAKE_CASE(AArch64ISD::THREAD_POINTER) MAKE_CASE(AArch64ISD::TLSDESC_CALLSEQ) + MAKE_CASE(AArch64ISD::PROBED_ALLOCA) MAKE_CASE(AArch64ISD::ADD_PRED) MAKE_CASE(AArch64ISD::MUL_PRED) MAKE_CASE(AArch64ISD::SDIV_PRED) @@ -2061,6 +2058,21 @@ return BB; } +MachineBasicBlock *AArch64TargetLowering::EmitDynamicProbedAlloc( + MachineInstr &MI, MachineBasicBlock *MBB) const { + MachineFunction &MF = *MBB->getParent(); + const AArch64InstrInfo &TII = + *static_cast<const AArch64InstrInfo *>(Subtarget->getInstrInfo()); + DebugLoc DL = MI.getDebugLoc(); + + Register TargetReg = MI.getOperand(0).getReg(); + int64_t ProbeSize = getStackProbeSize(MF); + + MachineBasicBlock::iterator NextInst = TII.insertStackProbingLoop(MI, -ProbeSize, TargetReg); + MI.eraseFromParent(); + return NextInst->getParent(); +} + MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter( MachineInstr &MI, MachineBasicBlock *BB) const { switch (MI.getOpcode()) { @@ -2080,6 +2092,9 @@ case AArch64::CATCHRET: return EmitLoweredCatchRet(MI, BB); + + case AArch64::PROBED_STACKALLOC_DYN: + return EmitDynamicProbedAlloc(MI, BB); } } @@ -10460,9 +10475,31 @@ AN->getMemOperand()); } -SDValue AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC( - SDValue Op, SDValue Chain, SDValue &Size, SelectionDAG &DAG) const { +SDValue +AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(SDValue Op, + SelectionDAG &DAG) const { SDLoc dl(Op); + // Get the inputs.
+ SDNode *Node = Op.getNode(); + SDValue Chain = Op.getOperand(0); + SDValue Size = Op.getOperand(1); + MaybeAlign Align = + cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue(); + EVT VT = Node->getValueType(0); + + if (DAG.getMachineFunction().getFunction().hasFnAttribute( + "no-stack-arg-probe")) { + SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64); + Chain = SP.getValue(1); + SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size); + if (Align) + SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0), + DAG.getConstant(-(uint64_t)Align->value(), dl, VT)); + Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP); + SDValue Ops[2] = {SP, Chain}; + return DAG.getMergeValues(Ops, dl); + } + EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDValue Callee = DAG.getTargetExternalSymbol("__chkstk", PtrVT, 0); @@ -10471,6 +10508,7 @@ if (Subtarget->hasCustomCallingConv()) TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask); + Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl); Size = DAG.getNode(ISD::SRL, dl, MVT::i64, Size, DAG.getConstant(4, dl, MVT::i64)); Chain = DAG.getCopyToReg(Chain, dl, AArch64::X15, Size, SDValue()); @@ -10485,55 +10523,61 @@ Size = DAG.getNode(ISD::SHL, dl, MVT::i64, Size, DAG.getConstant(4, dl, MVT::i64)); - return Chain; + + SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64); + Chain = SP.getValue(1); + SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size); + if (Align) + SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0), + DAG.getConstant(-(uint64_t)Align->value(), dl, VT)); + Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP); + + Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true), + DAG.getIntPtrConstant(0, dl, true), SDValue(), dl); + + SDValue Ops[2] = {SP, Chain}; + return DAG.getMergeValues(Ops, dl); } SDValue -AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, - SelectionDAG &DAG) const { - assert(Subtarget->isTargetWindows() && - "Only Windows alloca probing supported"); - SDLoc dl(Op);
+AArch64TargetLowering::LowerInlineDYNAMIC_STACKALLOC(SDValue Op, + SelectionDAG &DAG) const { // Get the inputs. SDNode *Node = Op.getNode(); SDValue Chain = Op.getOperand(0); - SDValue Size = Op.getOperand(1); + SDValue Size = Op.getOperand(1); MaybeAlign Align = cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue(); + SDLoc dl(Op); EVT VT = Node->getValueType(0); - if (DAG.getMachineFunction().getFunction().hasFnAttribute( - "no-stack-arg-probe")) { - SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64); - Chain = SP.getValue(1); - SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size); - if (Align) - SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0), - DAG.getConstant(-(uint64_t)Align->value(), dl, VT)); - Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP); - SDValue Ops[2] = {SP, Chain}; - return DAG.getMergeValues(Ops, dl); - } - - Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl); - - Chain = LowerWindowsDYNAMIC_STACKALLOC(Op, Chain, Size, DAG); - + // Construct the new SP value in a GPR. SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64); Chain = SP.getValue(1); SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size); if (Align) SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0), DAG.getConstant(-(uint64_t)Align->value(), dl, VT)); - Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP); - - Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true), - DAG.getIntPtrConstant(0, dl, true), SDValue(), dl); + // Set the real SP to the new value with a probing loop.
+ Chain = DAG.getNode(AArch64ISD::PROBED_ALLOCA, dl, MVT::Other, Chain, SP); SDValue Ops[2] = {SP, Chain}; return DAG.getMergeValues(Ops, dl); } +SDValue +AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, + SelectionDAG &DAG) const { + MachineFunction &MF = DAG.getMachineFunction(); + + if (Subtarget->isTargetWindows()) + return LowerWindowsDYNAMIC_STACKALLOC(Op, DAG); + if (hasInlineStackProbe(MF)) + return LowerInlineDYNAMIC_STACKALLOC(Op, DAG); + // Neither probing scheme applies: return an empty SDValue. + return SDValue(); +} + SDValue AArch64TargetLowering::LowerVSCALE(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); Index: llvm/lib/Target/AArch64/AArch64InstrInfo.td =================================================================== --- llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -603,6 +603,11 @@ def AArch64tbl : SDNode<"AArch64ISD::TBL", SDT_AArch64TBL>; +def AArch64probedalloca + : SDNode<"AArch64ISD::PROBED_ALLOCA", + SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>, + [SDNPHasChain]>; + //===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// @@ -663,6 +668,15 @@ (ins GPR64:$target), []>, Sched<[]>; + +// Probed stack allocations of a variable size, used for allocas of unknown size +// when stack-clash protection is enabled.
+let usesCustomInserter = 1, Defs = [SP, NZCV] in +def PROBED_STACKALLOC_DYN + : Pseudo<(outs), + (ins GPR64common:$target), + [(AArch64probedalloca GPR64common:$target)]>, + Sched<[]>; } // Defs = [SP], Uses = [SP], hasSideEffects = 1, isCodeGenOnly = 1 let isReMaterializable = 1, isCodeGenOnly = 1 in { Index: llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll @@ -0,0 +1,217 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs | FileCheck %s + +; Dynamically-sized allocation, needs a loop which can handle any size at +; runtime. The final iteration of the loop will temporarily put SP below the +; target address, but this doesn't break any of the ABI constraints on the +; stack, and also doesn't probe below the target SP value. +define void @dynamic(i64 %size, i8** %out) "probe-stack"="inline-asm" "frame-pointer"="none" { +; CHECK-LABEL: dynamic: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-16]!
// 16-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: .cfi_def_cfa w29, 16 +; CHECK-NEXT: .cfi_offset w30, -8 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: add x9, x0, #15 // =15 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 +; CHECK-NEXT: sub x8, x8, x9 +; CHECK-NEXT: .LBB0_1: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: cmp sp, x8 +; CHECK-NEXT: b.le .LBB0_3 +; CHECK-NEXT: // %bb.2: // in Loop: Header=BB0_1 Depth=1 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: b .LBB0_1 +; CHECK-NEXT: .LBB0_3: +; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: str x8, [x1] +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: ret + %vla = alloca i8, i64 %size, align 1 + store i8* %vla, i8** %out, align 8 + ret void +} + +; This function has a fixed-size stack slot and a dynamic one. The fixed-size +; slot isn't large enough that we would normally probe it, but we need to do so +; here, otherwise the gap between the CSR save and the first probe of the +; dynamic allocation could be too large when the size of the dynamic +; allocation is close to the guard size. +define void @dynamic_fixed(i64 %size, i8** %out1, i8** %out2) "probe-stack"="inline-asm" "frame-pointer"="none" { +; CHECK-LABEL: dynamic_fixed: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: str xzr, [sp, #-64]!
+; CHECK-NEXT: .cfi_def_cfa w29, 16 +; CHECK-NEXT: .cfi_offset w30, -8 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub x9, x29, #64 // =64 +; CHECK-NEXT: add x10, x0, #15 // =15 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x9, [x1] +; CHECK-NEXT: and x9, x10, #0xfffffffffffffff0 +; CHECK-NEXT: sub x8, x8, x9 +; CHECK-NEXT: .LBB1_1: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: cmp sp, x8 +; CHECK-NEXT: b.le .LBB1_3 +; CHECK-NEXT: // %bb.2: // in Loop: Header=BB1_1 Depth=1 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: b .LBB1_1 +; CHECK-NEXT: .LBB1_3: +; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: str x8, [x2] +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: ret + %vla1 = alloca i8, i64 64, align 1 + store i8* %vla1, i8** %out1, align 8 + %vla2 = alloca i8, i64 %size, align 1 + store i8* %vla2, i8** %out2, align 8 + ret void +} + +; Dynamic allocation, with an alignment requirement greater than the alignment +; of SP. Done by ANDing the target SP with a constant to align it down, then +; doing the loop as normal. Note that we also re-align the stack in the prolog, +; which isn't actually needed because the only aligned allocations are dynamic; +; this is done even without stack probing. +define void @dynamic_align_64(i64 %size, i8** %out) "probe-stack"="inline-asm" "frame-pointer"="none" { +; CHECK-LABEL: dynamic_align_64: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-32]!
// 16-byte Folded Spill +; CHECK-NEXT: sub x9, sp, #32 // =32 +; CHECK-NEXT: and x9, x9, #0xffffffffffffffc0 +; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: .LBB2_1: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: cmp sp, x9 +; CHECK-NEXT: b.le .LBB2_3 +; CHECK-NEXT: // %bb.2: // in Loop: Header=BB2_1 Depth=1 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: b .LBB2_1 +; CHECK-NEXT: .LBB2_3: +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: mov x19, sp +; CHECK-NEXT: .cfi_def_cfa w29, 32 +; CHECK-NEXT: .cfi_offset w19, -16 +; CHECK-NEXT: .cfi_offset w30, -24 +; CHECK-NEXT: .cfi_offset w29, -32 +; CHECK-NEXT: add x9, x0, #15 // =15 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 +; CHECK-NEXT: sub x8, x8, x9 +; CHECK-NEXT: and x8, x8, #0xffffffffffffffc0 +; CHECK-NEXT: .LBB2_4: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: cmp sp, x8 +; CHECK-NEXT: b.le .LBB2_6 +; CHECK-NEXT: // %bb.5: // in Loop: Header=BB2_4 Depth=1 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: b .LBB2_4 +; CHECK-NEXT: .LBB2_6: +; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: str x8, [x1] +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: ret + %vla = alloca i8, i64 %size, align 64 + store i8* %vla, i8** %out, align 8 + ret void +} + +; Dynamic allocation, with an alignment greater than the stack guard size. The +; only difference from the previous test is the constant used for aligning +; the target SP; the loop will probe the whole allocation without needing to +; know about the alignment padding.
+define void @dynamic_align_8192(i64 %size, i8** %out) "probe-stack"="inline-asm" "frame-pointer"="none" { +; CHECK-LABEL: dynamic_align_8192: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; CHECK-NEXT: sub x9, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: sub x9, x9, #4064 // =4064 +; CHECK-NEXT: and x9, x9, #0xffffffffffffe000 +; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: .LBB3_1: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: cmp sp, x9 +; CHECK-NEXT: b.le .LBB3_3 +; CHECK-NEXT: // %bb.2: // in Loop: Header=BB3_1 Depth=1 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: b .LBB3_1 +; CHECK-NEXT: .LBB3_3: +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: mov x19, sp +; CHECK-NEXT: .cfi_def_cfa w29, 32 +; CHECK-NEXT: .cfi_offset w19, -16 +; CHECK-NEXT: .cfi_offset w30, -24 +; CHECK-NEXT: .cfi_offset w29, -32 +; CHECK-NEXT: add x9, x0, #15 // =15 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 +; CHECK-NEXT: sub x8, x8, x9 +; CHECK-NEXT: and x8, x8, #0xffffffffffffe000 +; CHECK-NEXT: .LBB3_4: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: cmp sp, x8 +; CHECK-NEXT: b.le .LBB3_6 +; CHECK-NEXT: // %bb.5: // in Loop: Header=BB3_4 Depth=1 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: b .LBB3_4 +; CHECK-NEXT: .LBB3_6: +; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: str x8, [x1] +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: ret + %vla = alloca i8, i64 %size, align 8192 + store i8* %vla, i8** %out, align 8 + ret void +} + +; For 64k guard pages, the only difference is the constant subtracted from SP +; in the loop. 
+define void @dynamic_64k_guard(i64 %size, i8** %out) "probe-stack"="inline-asm" "frame-pointer"="none" "stack-probe-size"="65536" { +; CHECK-LABEL: dynamic_64k_guard: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: .cfi_def_cfa w29, 16 +; CHECK-NEXT: .cfi_offset w30, -8 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: add x9, x0, #15 // =15 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 +; CHECK-NEXT: sub x8, x8, x9 +; CHECK-NEXT: .LBB4_1: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 +; CHECK-NEXT: cmp sp, x8 +; CHECK-NEXT: b.le .LBB4_3 +; CHECK-NEXT: // %bb.2: // in Loop: Header=BB4_1 Depth=1 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: b .LBB4_1 +; CHECK-NEXT: .LBB4_3: +; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: str x8, [x1] +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: ret + %vla = alloca i8, i64 %size, align 1 + store i8* %vla, i8** %out, align 8 + ret void +}