Index: lib/Target/AArch64/AArch64CallingConvention.td
===================================================================
--- lib/Target/AArch64/AArch64CallingConvention.td
+++ lib/Target/AArch64/AArch64CallingConvention.td
@@ -345,3 +345,9 @@
 def CSR_AArch64_RT_MostRegs : CalleeSavedRegs<(add CSR_AArch64_AAPCS,
                                                (sequence "X%u", 9, 15))>;
 
+// Darwin stack probing function CSRs. Everything listed is preserved; X9-X11
+// (used by the probe) and LR (clobbered by the call itself) are left out.
+def CSR_AArch64_StackProbe_Darwin
+    : CalleeSavedRegs<(add (sequence "X%u", 0, 8),
+                           (sequence "X%u", 12, 28), FP, SP,
+                           (sequence "Q%u", 0, 31))>;
Index: lib/Target/AArch64/AArch64FrameLowering.h
===================================================================
--- lib/Target/AArch64/AArch64FrameLowering.h
+++ lib/Target/AArch64/AArch64FrameLowering.h
@@ -69,6 +69,10 @@
 
   bool enableStackSlotScavenging(const MachineFunction &MF) const override;
 
+  void emitStackProbe(MachineFunction &MF, MachineBasicBlock &MBB,
+                      MachineBasicBlock::iterator MBBI, DebugLoc DL,
+                      unsigned NumBytes) const;
+
 private:
   bool shouldCombineCSRLocalStackBump(MachineFunction &MF,
                                       unsigned StackBumpBytes) const;
Index: lib/Target/AArch64/AArch64FrameLowering.cpp
===================================================================
--- lib/Target/AArch64/AArch64FrameLowering.cpp
+++ lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -455,6 +455,67 @@
   OffsetOpnd.setImm(OffsetOpnd.getImm() + LocalStackSize / 8);
 }
 
+// Emit a stack probing function call at the specified location.
+void AArch64FrameLowering::emitStackProbe(MachineFunction &MF,
+                                          MachineBasicBlock &MBB,
+                                          MachineBasicBlock::iterator MBBI,
+                                          DebugLoc DL,
+                                          unsigned NumBytes) const {
+  const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
+  const AArch64InstrInfo *TII =
+      static_cast<const AArch64InstrInfo *>(Subtarget.getInstrInfo());
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+  const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
+  const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
+
+  // If the LR has already been saved we don't need to save it before calling
+  // the probe function. However if it hasn't then the probe will clobber it.
+  bool LRIsSaved =
+      std::any_of(CSI.begin(), CSI.end(), [](const CalleeSavedInfo &SI) {
+        return SI.getReg() == AArch64::LR;
+      });
+  if (!LRIsSaved) {
+    // LR wasn't saved as a CSR, so we need to save it ourselves. The
+    // pre-indexed store moves SP down by 16 so the slot lies at the new SP;
+    // the matching post-indexed load after the call puts SP back.
+    BuildMI(MBB, MBBI, DL, TII->get(AArch64::STRXpre))
+        .addReg(AArch64::SP, RegState::Define)
+        .addReg(AArch64::LR)
+        .addReg(AArch64::SP)
+        .addImm(-16) // Keep SP 16 byte aligned.
+        .setMIFlags(MachineInstr::FrameSetup);
+  }
+
+  StringRef Symbol = Subtarget.getTargetLowering()->getStackProbeSymbolName(MF);
+  // We pass the number of bytes to check to the probe function in register
+  // W9, a temporary register that we can use in places like the prolog. The
+  // probe function should preserve all registers except X9, X10, X11 and LR.
+
+  // To materialize the probe size, we emit a MOVi32imm pseudo-instruction which
+  // will later get expanded into either an ORR Wd, Wzr, #bimm32 or into a
+  // sequence of MOV instructions depending on the value.
+  BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVi32imm))
+      .addReg(AArch64::W9, RegState::Define)
+      .addImm(NumBytes)
+      .setMIFlags(MachineInstr::FrameSetup);
+
+  BuildMI(MBB, MBBI, DL, TII->get(AArch64::BL))
+      .addExternalSymbol(MF.createExternalSymbolName(Symbol))
+      .addRegMask(RegInfo->getDarwinStackProbePreservedMask())
+      .addReg(AArch64::X9, RegState::Implicit)
+      .setMIFlags(MachineInstr::FrameSetup);
+
+  if (!LRIsSaved) {
+    // Restore LR.
+    BuildMI(MBB, MBBI, DL, TII->get(AArch64::LDRXpost))
+        .addReg(AArch64::SP, RegState::Define)
+        .addReg(AArch64::LR, RegState::Define)
+        .addReg(AArch64::SP)
+        .addImm(16)
+        .setMIFlags(MachineInstr::FrameSetup);
+  }
+}
+
 void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
                                         MachineBasicBlock &MBB) const {
   MachineBasicBlock::iterator MBBI = MBB.begin();
@@ -477,6 +538,11 @@
   if (MF.getFunction()->getCallingConv() == CallingConv::GHC)
     return;
 
+  // Currently only Darwin platforms support stack probing on AArch64,
+  // with a fixed probe size of 4096 bytes.
+  const int StackProbeSize = 4096;
+  StringRef ProbeSym = Subtarget.getTargetLowering()->getStackProbeSymbolName(MF);
+
   int NumBytes = (int)MFI.getStackSize();
   if (!AFI->hasStackFrame()) {
     assert(!HasFP && "unexpected function without stack frame but with FP");
@@ -491,6 +557,8 @@
     if (canUseRedZone(MF))
       ++NumRedZoneFunctions;
     else {
+      if (NumBytes > StackProbeSize && !ProbeSym.empty())
+        emitStackProbe(MF, MBB, MBBI, DL, NumBytes);
       emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, -NumBytes, TII,
                       MachineInstr::FrameSetup);
 
@@ -515,6 +583,8 @@
   AFI->setLocalStackSize(NumBytes - PrologueSaveSize);
 
   bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes);
+  // If we're going to combine SP updates then the stack adjustment must be less
+  // than 512 bytes, hence stack probing in the prologue is unnecessary.
   if (CombineSPBump) {
     emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, -NumBytes, TII,
                     MachineInstr::FrameSetup);
@@ -560,13 +630,21 @@
     assert(scratchSPReg != AArch64::NoRegister);
   }
 
-  // If we're a leaf function, try using the red zone.
-  if (!canUseRedZone(MF))
-    // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have
-    // the correct value here, as NumBytes also includes padding bytes,
-    // which shouldn't be counted here.
+  if (NumBytes > StackProbeSize && !ProbeSym.empty()) {
+    // We need to emit a call to the stack probe function. Note that we still
+    // need to adjust SP, the probing doesn't modify it.
+    emitStackProbe(MF, MBB, MBBI, DL, NumBytes);
     emitFrameOffset(MBB, MBBI, DL, scratchSPReg, AArch64::SP, -NumBytes, TII,
                     MachineInstr::FrameSetup);
+  } else {
+    // If we're a leaf function, try using the red zone.
+    if (!canUseRedZone(MF))
+      // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have
+      // the correct value here, as NumBytes also includes padding bytes,
+      // which shouldn't be counted here.
+      emitFrameOffset(MBB, MBBI, DL, scratchSPReg, AArch64::SP, -NumBytes,
+                      TII, MachineInstr::FrameSetup);
+  }
+
   if (NeedsRealignment) {
     const unsigned Alignment = MFI.getMaxAlignment();
Index: lib/Target/AArch64/AArch64ISelLowering.h
===================================================================
--- lib/Target/AArch64/AArch64ISelLowering.h
+++ lib/Target/AArch64/AArch64ISelLowering.h
@@ -477,6 +477,9 @@
   bool functionArgumentNeedsConsecutiveRegisters(Type *Ty,
                                                  CallingConv::ID CallConv,
                                                  bool isVarArg) const override;
+
+  StringRef getStackProbeSymbolName(MachineFunction &MF) const override;
+
 private:
   bool isExtFreeImpl(const Instruction *Ext) const override;
 
@@ -594,6 +597,7 @@
   SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerVECREDUCE(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
 
   SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
                         std::vector<SDNode *> *Created) const override;
Index: lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- lib/Target/AArch64/AArch64ISelLowering.cpp
+++ lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -253,7 +253,10 @@
   // Variable-sized objects.
   setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
   setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
-  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
+  if (Subtarget->isTargetDarwin() && TM.Options.EnableStackProbe)
+    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);
+  else
+    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
 
   // Constant pool entries
   setOperationAction(ISD::ConstantPool, MVT::i64, Custom);
@@ -2685,6 +2688,8 @@
   case ISD::VECREDUCE_FMAX:
   case ISD::VECREDUCE_FMIN:
     return LowerVECREDUCE(Op, DAG);
+  case ISD::DYNAMIC_STACKALLOC:
+    return LowerDYNAMIC_STACKALLOC(Op, DAG);
   }
 }
 
@@ -7367,6 +7372,51 @@
   }
 }
 
+/// Lower DYNAMIC_STACKALLOC on Darwin: call the stack probe, then adjust SP.
+SDValue
+AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  assert(Subtarget->isTargetDarwin() &&
+         "Only Darwin dynamic alloca probing supported");
+  SDLoc dl(Op);
+  // Get the inputs.
+  SDNode *Node = Op.getNode();
+  SDValue Chain = Op.getOperand(0);
+  SDValue Size = Op.getOperand(1);
+  unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
+  EVT VT = Node->getValueType(0);
+  EVT PtrVT = getPointerTy(DAG.getDataLayout());
+
+  SDValue Callee = DAG.getTargetExternalSymbol("___chkstk_darwin", PtrVT, 0);
+
+  Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
+  const auto Mask =
+      Subtarget->getRegisterInfo()->getDarwinStackProbePreservedMask();
+
+  Chain = DAG.getCopyToReg(Chain, dl, AArch64::X9, Size, SDValue());
+  Chain =
+      DAG.getNode(AArch64ISD::CALL, dl, DAG.getVTList(MVT::Other, MVT::Glue),
+                  Chain, Callee, DAG.getRegister(AArch64::X9, MVT::i64),
+                  DAG.getRegisterMask(Mask), Chain.getValue(1));
+
+  SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
+  Chain = SP.getValue(1);
+  SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
+  // Apply any requested over-alignment before writing SP back, so that SP is
+  // updated once. NOTE(review): only Size bytes were probed above; confirm
+  // the extra bytes skipped by the alignment mask need no probe of their own.
+  if (Align)
+    SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
+                     DAG.getConstant(-(uint64_t)Align, dl, VT));
+  Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
+
+  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
+                             DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);
+
+  SDValue Ops[2] = {SP, Chain};
+  return DAG.getMergeValues(Ops, dl);
+}
+
 /// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
 /// MemIntrinsicNodes.  The associated MachineMemOperands record the alignment
 /// specified in the intrinsic calls.
@@ -10984,3 +11034,10 @@
 
   return 3 * getPointerTy(DL).getSizeInBits() + 2 * 32;
 }
+
+StringRef
+AArch64TargetLowering::getStackProbeSymbolName(MachineFunction &MF) const {
+  if (Subtarget->isTargetDarwin() && MF.getTarget().Options.EnableStackProbe)
+    return "___chkstk_darwin";
+  return "";
+}
Index: lib/Target/AArch64/AArch64RegisterInfo.h
===================================================================
--- lib/Target/AArch64/AArch64RegisterInfo.h
+++ lib/Target/AArch64/AArch64RegisterInfo.h
@@ -61,6 +61,9 @@
   const uint32_t *getThisReturnPreservedMask(const MachineFunction &MF,
                                              CallingConv::ID) const;
 
+  /// Stack probing calls preserve different CSRs to the normal CC.
+  const uint32_t *getDarwinStackProbePreservedMask() const;
+
   BitVector getReservedRegs(const MachineFunction &MF) const override;
   bool isConstantPhysReg(unsigned PhysReg) const override;
   const TargetRegisterClass *
Index: lib/Target/AArch64/AArch64RegisterInfo.cpp
===================================================================
--- lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -114,6 +114,10 @@
   return CSR_AArch64_AAPCS_ThisReturn_RegMask;
 }
 
+const uint32_t *AArch64RegisterInfo::getDarwinStackProbePreservedMask() const {
+  return CSR_AArch64_StackProbe_Darwin_RegMask;
+}
+
 BitVector
 AArch64RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
   const AArch64FrameLowering *TFI = getFrameLowering(MF);
Index: test/CodeGen/AArch64/arm64-stack-probing.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AArch64/arm64-stack-probing.ll
@@ -0,0 +1,70 @@
+; RUN: llc < %s -mtriple=arm64-apple-darwin -verify-machineinstrs -stack-probe | FileCheck %s
+; RUN: llc < %s -mtriple=arm64-apple-darwin -verify-machineinstrs | FileCheck --check-prefix=DISABLED %s
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "arm64-apple-darwin"
+
+declare i32 @use_ptr(i32*) #1
+
+; Expect a probe here due to static object size > 4096.
+; Function Attrs: noinline nounwind optnone uwtable
+define void @static_test1_probe() #0 {
+; CHECK-LABEL: static_test1_probe
+; CHECK: orr w9, wzr, #0x4000
+; CHECK-NEXT: bl ____chkstk_darwin
+; CHECK-NEXT: sub sp, sp, #4, lsl #12
+; DISABLED-NOT: bl ____chkstk_darwin
+  %1 = alloca [4096 x i32], align 4
+  %2 = getelementptr inbounds [4096 x i32], [4096 x i32]* %1, i32 0, i32 0
+  %3 = call i32 @use_ptr(i32* %2)
+  ret void
+}
+
+; Stack size should be less than 4k, no probe.
+; Function Attrs: noinline nounwind optnone uwtable
+define void @static_test2_small() #0 {
+; CHECK-LABEL: static_test2_small
+; CHECK-NOT: bl ____chkstk_darwin
+  %1 = alloca [64 x i32], align 4
+  %2 = getelementptr inbounds [64 x i32], [64 x i32]* %1, i32 0, i32 0
+  %3 = call i32 @use_ptr(i32* %2)
+  ret void
+}
+
+@g = common local_unnamed_addr global i32* null, align 8
+
+; Check that the LR is saved in the prolog for static allocas when it isn't
+; otherwise saved as a normal callee-save reg.
+; Function Attrs: noinline nounwind optnone uwtable
+define void @test_static_oversize_nocsr(i32* nocapture readnone) local_unnamed_addr #0 {
+; CHECK-LABEL: test_static_oversize_nocsr
+; CHECK: stp x28, x27, [sp, #-16]!
+; CHECK-NEXT: str x30, [sp, #-16]!
+; CHECK-NEXT: mov w9, #8000
+; CHECK-NEXT: bl ____chkstk_darwin
+; CHECK-NEXT: ldr x30, [sp], #16
+  %2 = alloca [2000 x i32], align 4
+  %3 = bitcast [2000 x i32]* %2 to i32*
+  store i32* %3, i32** @g, align 8
+  ret void
+}
+
+; Test dynamic sized allocas.
+; Function Attrs: noinline nounwind optnone uwtable
+define void @test_dynamic(i32* nocapture readnone, i64 %num) local_unnamed_addr #0 {
+; CHECK-LABEL: test_dynamic
+; CHECK: add x8, x1, #15
+; CHECK-NEXT: and x8, x8, #0xfffffffffffffff0
+; CHECK-NEXT: mov x9, x8
+; CHECK-NEXT: bl ____chkstk_darwin
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: subs x8, x9, x8
+  %2 = alloca i8, i64 %num, align 16
+  %3 = bitcast i8* %2 to i32*
+  store i32* %3, i32** @g, align 8
+  ret void
+}
+
+
+attributes #0 = { noinline nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cyclone" "target-features"="+crypto,+fp-armv8,+neon,+zcm,+zcz" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cyclone" "target-features"="+crypto,+fp-armv8,+neon,+zcm,+zcz" "unsafe-fp-math"="false" "use-soft-float"="false" }
+