Index: llvm/lib/Target/AArch64/AArch64FrameLowering.h =================================================================== --- llvm/lib/Target/AArch64/AArch64FrameLowering.h +++ llvm/lib/Target/AArch64/AArch64FrameLowering.h @@ -161,6 +161,20 @@ /// Emit target zero call-used regs. void emitZeroCallUsedRegs(BitVector RegsToZero, MachineBasicBlock &MBB) const override; + + /// Replace a StackProbe stub (if any) with the actual probe code inline + void inlineStackProbe(MachineFunction &MF, + MachineBasicBlock &PrologueMBB) const override; + MachineBasicBlock::iterator + inlineStackProbeFixed(MachineBasicBlock::iterator MBBI) const; + + MachineBasicBlock::iterator + inlineStackProbeVar(MachineBasicBlock::iterator MBBI) const; + + MachineBasicBlock::iterator + inlineStackProbeLoopExactMultiple(MachineBasicBlock::iterator MBBI, + int64_t NegProbeSize, + Register TargetReg) const; }; } // End llvm namespace Index: llvm/lib/Target/AArch64/AArch64FrameLowering.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -854,9 +854,11 @@ MachineBasicBlock *TmpMBB = const_cast(&MBB); const AArch64Subtarget &Subtarget = MF->getSubtarget(); const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); + const AArch64TargetLowering *TLI = Subtarget.getTargetLowering(); - // Don't need a scratch register if we're not going to re-align the stack. - if (!RegInfo->hasStackRealignment(*MF)) + // Don't need a scratch register if we're not going to re-align the stack or + // emit stack probes. + if (!RegInfo->hasStackRealignment(*MF) && TLI->hasInlineStackProbe(*MF)) return true; // Otherwise, we can use any block as long as it has a scratch register // available. @@ -1428,6 +1430,7 @@ const Function &F = MF.getFunction(); const AArch64Subtarget &Subtarget = MF.getSubtarget(); const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); + const AArch64TargetLowering &TLI = *Subtarget.getTargetLowering(); const TargetInstrInfo *TII = Subtarget.getInstrInfo(); MachineModuleInfo &MMI = MF.getMMI(); AArch64FunctionInfo *AFI = MF.getInfo(); @@ -1783,12 +1786,14 @@ } } - StackOffset AllocateBefore = SVEStackSize, AllocateAfter = {}; + StackOffset SVECalleeSavedSize = {}, SVELocalsSize = SVEStackSize; MachineBasicBlock::iterator CalleeSavesBegin = MBBI, CalleeSavesEnd = MBBI; // Process the SVE callee-saves to determine what space needs to be // allocated. if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) { + LLVM_DEBUG(dbgs() << "SVECalleeSavedStackSize = " << CalleeSavedSize + << "\n"); // Find callee save instructions in frame. CalleeSavesBegin = MBBI; assert(IsSVECalleeSave(CalleeSavesBegin) && "Unexpected instruction"); @@ -1796,33 +1801,65 @@ ++MBBI; CalleeSavesEnd = MBBI; - AllocateBefore = StackOffset::getScalable(CalleeSavedSize); - AllocateAfter = SVEStackSize - AllocateBefore; - } - - // Allocate space for the callee saves (if any). - emitFrameOffset( - MBB, CalleeSavesBegin, DL, AArch64::SP, AArch64::SP, -AllocateBefore, TII, - MachineInstr::FrameSetup, false, false, nullptr, - EmitAsyncCFI && !HasFP && AllocateBefore, - StackOffset::getFixed((int64_t)MFI.getStackSize() - NumBytes)); + SVECalleeSavedSize = StackOffset::getScalable(CalleeSavedSize); + SVELocalsSize = SVEStackSize - SVECalleeSavedSize; - if (EmitAsyncCFI) - emitCalleeSavedSVELocations(MBB, CalleeSavesEnd); + // Allocate space for the SVE callee saves. + // This space doesn't need stack probing, because it will all be written to + // when saving the CSRs and the SVE CSRs are saved in descending address + // order. + emitFrameOffset( + MBB, CalleeSavesBegin, DL, AArch64::SP, AArch64::SP, + -SVECalleeSavedSize, TII, MachineInstr::FrameSetup, false, false, + nullptr, EmitAsyncCFI && !HasFP, + StackOffset::getFixed((int64_t)MFI.getStackSize() - NumBytes)); + if (EmitAsyncCFI) + emitCalleeSavedSVELocations(MBB, CalleeSavesEnd); + } - // Finally allocate remaining SVE stack space. - emitFrameOffset(MBB, CalleeSavesEnd, DL, AArch64::SP, AArch64::SP, - -AllocateAfter, TII, MachineInstr::FrameSetup, false, false, - nullptr, EmitAsyncCFI && !HasFP && AllocateAfter, - AllocateBefore + StackOffset::getFixed( - (int64_t)MFI.getStackSize() - NumBytes)); + // Allocate stack space for the local SVE objects. + if (SVELocalsSize) { + if (TLI.hasInlineStackProbe(MF)) { + Register ScratchReg = findScratchNonCalleeSaveRegister(&MBB); + assert(ScratchReg != AArch64::NoRegister); + // Save current SP to the scratch register. + BuildMI(MBB, MBBI, DL, TII->get(AArch64::ADDXri), ScratchReg) + .addReg(AArch64::SP) + .addImm(0) + .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)) + .setMIFlags(MachineInstr::FrameSetup); + emitFrameOffset( + MBB, CalleeSavesEnd, DL, AArch64::SP, AArch64::SP, -SVELocalsSize, + TII, MachineInstr::FrameSetup, false, false, nullptr, + EmitAsyncCFI && !HasFP, + SVECalleeSavedSize + + StackOffset::getFixed((int64_t)MFI.getStackSize() - NumBytes)); + BuildMI(MBB, MBBI, DL, TII->get(AArch64::PROBED_STACKALLOC_VAR), + ScratchReg) + .addReg(AArch64::SP) + .addImm(0); + } else { + emitFrameOffset( + MBB, CalleeSavesEnd, DL, AArch64::SP, AArch64::SP, -SVELocalsSize, + TII, MachineInstr::FrameSetup, false, false, nullptr, + EmitAsyncCFI && !HasFP, + SVECalleeSavedSize + + StackOffset::getFixed((int64_t)MFI.getStackSize() - NumBytes)); + } + } // Allocate space for the rest of the frame. if (NumBytes) { unsigned scratchSPReg = AArch64::SP; + bool NeedsStackProbe = TLI.hasInlineStackProbe(MF) && + (NumBytes > TLI.getStackProbeMaxUnprobedStack(MF) || + MFI.hasVarSizedObjects()); if (NeedsRealignment) { scratchSPReg = findScratchNonCalleeSaveRegister(&MBB); + NeedsStackProbe |= TLI.hasInlineStackProbe(MF) && + (NumBytes + MFI.getMaxAlign().value()) > + TLI.getStackProbeMaxUnprobedStack(MF); assert(scratchSPReg != AArch64::NoRegister); } @@ -1831,12 +1868,25 @@ // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have // the correct value here, as NumBytes also includes padding bytes, // which shouldn't be counted here. - emitFrameOffset( - MBB, MBBI, DL, scratchSPReg, AArch64::SP, - StackOffset::getFixed(-NumBytes), TII, MachineInstr::FrameSetup, - false, NeedsWinCFI, &HasWinCFI, EmitAsyncCFI && !HasFP, + StackOffset CFAOffset = SVEStackSize + - StackOffset::getFixed((int64_t)MFI.getStackSize() - NumBytes)); + StackOffset::getFixed((int64_t)MFI.getStackSize() - NumBytes); + if (NeedsStackProbe && !NeedsRealignment) { + // If we don't need to re-align the stack, we can use a more efficient + // sequence for stack probing. + Register ScratchReg = findScratchNonCalleeSaveRegister(&MBB); + assert(ScratchReg != AArch64::NoRegister); + BuildMI(MBB, MBBI, DL, TII->get(AArch64::PROBED_STACKALLOC)) + .addDef(ScratchReg) + .addImm(NumBytes) + .addImm(CFAOffset.getFixed()) + .addImm(CFAOffset.getScalable()); + } else { + emitFrameOffset(MBB, MBBI, DL, scratchSPReg, AArch64::SP, + StackOffset::getFixed(-NumBytes), TII, + MachineInstr::FrameSetup, false, NeedsWinCFI, + &HasWinCFI, EmitAsyncCFI && !HasFP, CFAOffset); + } } if (NeedsRealignment) { assert(MFI.getMaxAlign() > Align(1)); @@ -1845,12 +1895,34 @@ // SUB X9, SP, NumBytes // -- X9 is temporary register, so shouldn't contain any live data here, // -- free to use. This is already produced by emitFrameOffset above. - // AND SP, X9, 0b11111...0000 - uint64_t AndMask = ~(MFI.getMaxAlign().value() - 1); - BuildMI(MBB, MBBI, DL, TII->get(AArch64::ANDXri), AArch64::SP) - .addReg(scratchSPReg, RegState::Kill) - .addImm(AArch64_AM::encodeLogicalImmediate(AndMask, 64)); + const uint64_t MaxAlign = MFI.getMaxAlign().value(); + const uint64_t AndMask = ~(MaxAlign - 1); + + if (NeedsStackProbe) { + // AND X9, X9, 0b11111...0000 + BuildMI(MBB, MBBI, DL, TII->get(AArch64::ANDXri), scratchSPReg) + .addReg(scratchSPReg, RegState::Kill) + .addImm(AArch64_AM::encodeLogicalImmediate(AndMask, 64)) + .setMIFlags(MachineInstr::FrameSetup); + BuildMI(MBB, MBBI, DL, TII->get(AArch64::PROBED_STACKALLOC_VAR), + AArch64::SP) + .addReg(scratchSPReg) + .addImm(NumBytes + MaxAlign) + .setMIFlags(MachineInstr::FrameSetup); + // MOV SP, X9 + BuildMI(MBB, MBBI, DL, TII->get(AArch64::ADDXri), AArch64::SP) + .addReg(scratchSPReg) + .addImm(0) + .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)) + .setMIFlags(MachineInstr::FrameSetup); + } else { + // AND SP, X9, 0b11111...0000 + BuildMI(MBB, MBBI, DL, TII->get(AArch64::ANDXri), AArch64::SP) + .addReg(scratchSPReg, RegState::Kill) + .addImm(AArch64_AM::encodeLogicalImmediate(AndMask, 64)) + .setMIFlags(MachineInstr::FrameSetup); + } AFI->setStackRealigned(true); // No need for SEH instructions here; if we're realigning the stack, @@ -2809,7 +2881,8 @@ } return true; } - for (const RegPairInfo &RPI : llvm::reverse(RegPairs)) { + + auto EmitMI = [&](const RegPairInfo &RPI) { unsigned Reg1 = RPI.Reg1; unsigned Reg2 = RPI.Reg2; unsigned StrOpc; @@ -2828,30 +2901,30 @@ Align Alignment; switch (RPI.Type) { case RegPairInfo::GPR: - StrOpc = RPI.isPaired() ? AArch64::STPXi : AArch64::STRXui; - Size = 8; - Alignment = Align(8); - break; + StrOpc = RPI.isPaired() ? AArch64::STPXi : AArch64::STRXui; + Size = 8; + Alignment = Align(8); + break; case RegPairInfo::FPR64: - StrOpc = RPI.isPaired() ? AArch64::STPDi : AArch64::STRDui; - Size = 8; - Alignment = Align(8); - break; + StrOpc = RPI.isPaired() ? AArch64::STPDi : AArch64::STRDui; + Size = 8; + Alignment = Align(8); + break; case RegPairInfo::FPR128: - StrOpc = RPI.isPaired() ? AArch64::STPQi : AArch64::STRQui; - Size = 16; - Alignment = Align(16); - break; + StrOpc = RPI.isPaired() ? AArch64::STPQi : AArch64::STRQui; + Size = 16; + Alignment = Align(16); + break; case RegPairInfo::ZPR: - StrOpc = AArch64::STR_ZXI; - Size = 16; - Alignment = Align(16); - break; + StrOpc = AArch64::STR_ZXI; + Size = 16; + Alignment = Align(16); + break; case RegPairInfo::PPR: - StrOpc = AArch64::STR_PXI; - Size = 2; - Alignment = Align(2); - break; + StrOpc = AArch64::STR_PXI; + Size = 2; + Alignment = Align(2); + break; } LLVM_DEBUG(dbgs() << "CSR spill: (" << printReg(Reg1, TRI); if (RPI.isPaired()) dbgs() << ", " << printReg(Reg2, TRI); @@ -2896,7 +2969,22 @@ MachineFrameInfo &MFI = MF.getFrameInfo(); if (RPI.Type == RegPairInfo::ZPR || RPI.Type == RegPairInfo::PPR) MFI.setStackID(RPI.FrameIdx, TargetStackID::ScalableVector); + }; + const AArch64TargetLowering &TLI = + *MF.getSubtarget().getTargetLowering(); + if (TLI.hasInlineStackProbe(MF)) { + for (const RegPairInfo &RPI : reverse(RegPairs)) + if (!RPI.isScalable()) + EmitMI(RPI); + // Save the SVE registers from higher to lower addresses, so the saves serve + // as implicit stack probes. + for (const RegPairInfo &RPI : RegPairs) + if (RPI.isScalable()) + EmitMI(RPI); + } else { + for (const RegPairInfo &RPI : llvm::reverse(RegPairs)) + EmitMI(RPI); } return true; } @@ -4040,3 +4128,185 @@ dbgs() << "\n"; }); } + +/// Emit a loop to decrement SP until it is equal to TargetReg, with probes at +/// least every ProbeSize bytes. Returns an iterator of the first instruction +/// after the loop. The difference between SP and TargetReg must be an exact +/// multiple of ProbeSize. +MachineBasicBlock::iterator +AArch64FrameLowering::inlineStackProbeLoopExactMultiple( + MachineBasicBlock::iterator MBBI, int64_t ProbeSize, + Register TargetReg) const { + MachineBasicBlock &MBB = *MBBI->getParent(); + MachineFunction &MF = *MBB.getParent(); + const AArch64InstrInfo *TII = + MF.getSubtarget().getInstrInfo(); + DebugLoc DL = MBB.findDebugLoc(MBBI); + + MachineFunction::iterator MBBInsertPoint = std::next(MBB.getIterator()); + MachineBasicBlock *LoopMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock()); + MF.insert(MBBInsertPoint, LoopMBB); + MachineBasicBlock *ExitMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock()); + MF.insert(MBBInsertPoint, ExitMBB); + + // SUB SP, SP, #ProbeSize (or equivalent if ProbeSize is not encodable + // in SUB). + emitFrameOffset(*LoopMBB, LoopMBB->end(), DL, AArch64::SP, AArch64::SP, + StackOffset::getFixed(-ProbeSize), TII, + MachineInstr::FrameSetup); + // STR XZR, [SP] + BuildMI(*LoopMBB, LoopMBB->end(), DL, TII->get(AArch64::STRXui)) + .addReg(AArch64::XZR) + .addReg(AArch64::SP) + .addImm(0) + .setMIFlags(MachineInstr::FrameSetup); + // CMP SP, TargetReg + BuildMI(*LoopMBB, LoopMBB->end(), DL, TII->get(AArch64::SUBSXrx64), + AArch64::XZR) + .addReg(AArch64::SP) + .addReg(TargetReg) + .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 0)) + .setMIFlags(MachineInstr::FrameSetup); + // B.CC Loop + BuildMI(*LoopMBB, LoopMBB->end(), DL, TII->get(AArch64::Bcc)) + .addImm(AArch64CC::NE) + .addMBB(LoopMBB) + .setMIFlags(MachineInstr::FrameSetup); + + LoopMBB->addSuccessor(ExitMBB); + LoopMBB->addSuccessor(LoopMBB); + // Synthesize the exit MBB. + ExitMBB->splice(ExitMBB->end(), &MBB, MBBI, MBB.end()); + ExitMBB->transferSuccessorsAndUpdatePHIs(&MBB); + MBB.addSuccessor(LoopMBB); + // Update liveins. + recomputeLiveIns(*LoopMBB); + recomputeLiveIns(*ExitMBB); + + return ExitMBB->begin(); +} + +static const unsigned STACK_PROBE_LOOP_UNROLL = 3; + +MachineBasicBlock::iterator AArch64FrameLowering::inlineStackProbeFixed( + MachineBasicBlock::iterator MBBI) const { + MachineBasicBlock *MBB = MBBI->getParent(); + MachineFunction &MF = *MBB->getParent(); + const AArch64TargetLowering *TLI = + MF.getSubtarget().getTargetLowering(); + const AArch64InstrInfo *TII = + MF.getSubtarget().getInstrInfo(); + AArch64FunctionInfo *AFI = MF.getInfo(); + bool EmitAsyncCFI = AFI->needsAsyncDwarfUnwindInfo(MF); + bool HasFP = hasFP(MF); + + DebugLoc DL = MBB->findDebugLoc(MBBI); + Register ScratchReg = MBBI->getOperand(0).getReg(); + int64_t FrameSize = MBBI->getOperand(1).getImm(); + StackOffset CFAOffset = StackOffset::get(MBBI->getOperand(2).getImm(), + MBBI->getOperand(3).getImm()); + int64_t ProbeSize = TLI->getStackProbeSize(MF); + int64_t NumBlocks = FrameSize / ProbeSize; + int64_t ResidualSize = FrameSize % ProbeSize; + + LLVM_DEBUG(dbgs() << "Stack probing: total " << FrameSize << " bytes, " + << NumBlocks << " blocks of " << ProbeSize + << " bytes, plus " << ResidualSize << " bytes\n"); + + // Decrement SP by NumBlock * ProbeSize bytes, with either unrolled or + // ordinary loop. + if (NumBlocks <= STACK_PROBE_LOOP_UNROLL) { + for (int i = 0; i < NumBlocks; ++i) { + // SUB SP, SP, #FrameSize (or equivalent if FrameSize is not + // encodable in a SUB). + emitFrameOffset(*MBB, MBBI, DL, AArch64::SP, AArch64::SP, + StackOffset::getFixed(-ProbeSize), TII, + MachineInstr::FrameSetup, false, false, nullptr, + EmitAsyncCFI && !HasFP, CFAOffset); + CFAOffset += StackOffset::getFixed(ProbeSize); + // STR XZR, [SP] + BuildMI(*MBB, MBBI, DL, TII->get(AArch64::STRXui)) + .addReg(AArch64::XZR) + .addReg(AArch64::SP) + .addImm(0) + .setMIFlags(MachineInstr::FrameSetup); + } + } else if (NumBlocks != 0) { + // SUB ScratchReg, SP, #FrameSize (or equivalent if FrameSize is not + // encodable in ADD). ScrathReg may temporarily become the CFA register. + emitFrameOffset(*MBB, MBBI, DL, ScratchReg, AArch64::SP, + StackOffset::getFixed(-ProbeSize * NumBlocks), TII, + MachineInstr::FrameSetup, false, false, nullptr, + EmitAsyncCFI && !HasFP, CFAOffset); + CFAOffset += StackOffset::getFixed(ProbeSize * NumBlocks); + MBBI = inlineStackProbeLoopExactMultiple(MBBI, ProbeSize, ScratchReg); + MBB = MBBI->getParent(); + if (EmitAsyncCFI && !HasFP) { + // Set the CFA register back to SP. + const AArch64RegisterInfo &RegInfo = + *MF.getSubtarget().getRegisterInfo(); + unsigned Reg = RegInfo.getDwarfRegNum(AArch64::SP, true); + unsigned CFIIndex = + MF.addFrameInst(MCCFIInstruction::createDefCfaRegister(nullptr, Reg)); + BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex) + .setMIFlags(MachineInstr::FrameSetup); + } + } + + if (ResidualSize != 0) { + // SUB SP, SP, #ResidualSize (or equivalent if ResidualSize is not encodable + // in SUB). + emitFrameOffset(*MBB, MBBI, DL, AArch64::SP, AArch64::SP, + StackOffset::getFixed(-ResidualSize), TII, + MachineInstr::FrameSetup, false, false, nullptr, + EmitAsyncCFI && !HasFP, CFAOffset); + if (ResidualSize > TLI->getStackProbeMaxUnprobedStack(MF)) { + // STR XZR, [SP] + BuildMI(*MBB, MBBI, DL, TII->get(AArch64::STRXui)) + .addReg(AArch64::XZR) + .addReg(AArch64::SP) + .addImm(0) + .setMIFlags(MachineInstr::FrameSetup); + } + } + + MachineBasicBlock::iterator Next = std::next(MBBI); + MBBI->eraseFromParent(); + return Next; +} + +MachineBasicBlock::iterator AArch64FrameLowering::inlineStackProbeVar( + MachineBasicBlock::iterator MBBI) const { + MachineBasicBlock &MBB = *MBBI->getParent(); + MachineFunction &MF = *MBB.getParent(); + const AArch64InstrInfo *TII = + MF.getSubtarget().getInstrInfo(); + + DebugLoc DL = MBB.findDebugLoc(MBBI); + Register ScratchReg = MBBI->getOperand(0).getReg(); + Register TargetReg = MBBI->getOperand(1).getReg(); + int64_t AllocSizeMax = MBBI->getOperand(2).getImm(); + MachineBasicBlock::iterator NextInst = std::next(MBBI); + + NextInst = + TII->insertStackProbingLoop(MBBI, ScratchReg, TargetReg, AllocSizeMax); + + MBBI->eraseFromParent(); + return NextInst; +} + +void AArch64FrameLowering::inlineStackProbe(MachineFunction &MF, + MachineBasicBlock &MBB) const { + for (auto MBBI = MBB.begin(), E = MBB.end(); MBBI != E;) { + if (MBBI->getOpcode() == AArch64::PROBED_STACKALLOC) { + MBBI = inlineStackProbeFixed(MBBI); + E = MBBI->getParent()->end(); + } else if (MBBI->getOpcode() == AArch64::PROBED_STACKALLOC_VAR) { + MBBI = inlineStackProbeVar(MBBI); + E = MBBI->getParent()->end(); + } else { + ++MBBI; + } + } +} Index: llvm/lib/Target/AArch64/AArch64ISelLowering.h =================================================================== --- llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -942,6 +942,19 @@ // used for 64bit and 128bit vectors as well. bool useSVEForFixedLengthVectorVT(EVT VT, bool OverrideNEON = false) const; + /// True if stack clash protection is enabled for this functions. + bool hasInlineStackProbe(const MachineFunction &MF) const override; + + /// Get the interval between stack-clash probes, which is equal to the stack + /// guard size, in bytes. + unsigned getStackProbeSize(const MachineFunction &MF) const; + + /// Get the maximum allowed number of unprobed bytes above SP at an ABI + /// boundary. + unsigned getStackProbeMaxUnprobedStack(const MachineFunction &MF) const { + return 1024; + } + private: /// Keep a pointer to the AArch64Subtarget around so that we can /// make the right decision when generating code for different targets. Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -26088,3 +26088,37 @@ } return true; } + +bool AArch64TargetLowering::hasInlineStackProbe( + const MachineFunction &MF) const { + // If the function specifically requests inline stack probes, emit them. + if (MF.getFunction().hasFnAttribute("probe-stack")) { + if (MF.getFunction().getFnAttribute("probe-stack").getValueAsString() == + "inline-asm") + return true; + else + llvm_unreachable("Unsupported stack probing method"); + } + + return false; +} + +unsigned +AArch64TargetLowering::getStackProbeSize(const MachineFunction &MF) const { + const TargetFrameLowering *TFI = Subtarget->getFrameLowering(); + unsigned StackAlign = TFI->getStackAlignment(); + assert(StackAlign >= 1 && isPowerOf2_32(StackAlign) && + "Unexpected stack alignment"); + // The default stack probe size is 4096 if the function has no + // stack-probe-size attribute. This is a safe default because it is the + // smallest possible guard page size. + unsigned StackProbeSize = 4096; + const Function &Fn = MF.getFunction(); + if (Fn.hasFnAttribute("stack-probe-size")) + Fn.getFnAttribute("stack-probe-size") + .getValueAsString() + .getAsInteger(0, StackProbeSize); + // Round down to the stack alignment. + StackProbeSize &= ~(StackAlign - 1); + return StackProbeSize ? StackProbeSize : StackAlign; +} Index: llvm/lib/Target/AArch64/AArch64InstrInfo.h =================================================================== --- llvm/lib/Target/AArch64/AArch64InstrInfo.h +++ llvm/lib/Target/AArch64/AArch64InstrInfo.h @@ -347,6 +347,11 @@ static void decomposeStackOffsetForDwarfOffsets(const StackOffset &Offset, int64_t &ByteSized, int64_t &VGSized); + + MachineBasicBlock::iterator + insertStackProbingLoop(MachineBasicBlock::iterator MBBI, Register ScratchReg, + Register TargetReg, int64_t AllocSizeMax) const; + #define GET_INSTRINFO_HELPER_DECLS #include "AArch64GenInstrInfo.inc" Index: llvm/lib/Target/AArch64/AArch64InstrInfo.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -20,6 +20,7 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineCombinerPattern.h" #include "llvm/CodeGen/MachineFrameInfo.h" @@ -8461,6 +8462,106 @@ return AArch64::BLR; } +MachineBasicBlock::iterator AArch64InstrInfo::insertStackProbingLoop( + MachineBasicBlock::iterator MBBI, Register ScratchReg, Register TargetReg, + int64_t AllocSizeMax) const { + MachineBasicBlock &MBB = *MBBI->getParent(); + MachineFunction &MF = *MBB.getParent(); + const AArch64TargetLowering *TLI = + MF.getSubtarget().getTargetLowering(); + const AArch64InstrInfo *TII = + MF.getSubtarget().getInstrInfo(); + int64_t ProbeSize = (int64_t)TLI->getStackProbeSize(MF); + DebugLoc DL = MBB.findDebugLoc(MBBI); + + // The difference ScratchReg - TargetReg < AllocSizeMax. If that size is less + // than or equal to StackProbeSize we don't need to create a loop. + if (AllocSizeMax && AllocSizeMax <= ProbeSize) { + // STR XZR, [TargetReg] + BuildMI(MBB, MBBI, DL, TII->get(AArch64::STRXui)) + .addReg(AArch64::XZR) + .addReg(TargetReg) + .addImm(0) + .setMIFlags(MachineInstr::FrameSetup); + return std::next(MBBI); + } + + MachineFunction::iterator MBBInsertPoint = std::next(MBB.getIterator()); + MachineBasicBlock *LoopTestMBB = + MF.CreateMachineBasicBlock(MBB.getBasicBlock()); + MF.insert(MBBInsertPoint, LoopTestMBB); + MachineBasicBlock *LoopBodyMBB = + MF.CreateMachineBasicBlock(MBB.getBasicBlock()); + MF.insert(MBBInsertPoint, LoopBodyMBB); + MachineBasicBlock *ExitMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock()); + MF.insert(MBBInsertPoint, ExitMBB); + + // LoopTest: + // SUB ScratchReg, ScratchReg, #ProbeSize + emitFrameOffset(*LoopTestMBB, LoopTestMBB->end(), DL, ScratchReg, ScratchReg, + StackOffset::getFixed(-ProbeSize), TII, + MachineInstr::FrameSetup); + + // CMP ScratchReg, TargetReg + AArch64CC::CondCode Cond = AArch64CC::LE; + Register Op1 = ScratchReg; + Register Op2 = TargetReg; + if (Op2 == AArch64::SP) { + assert(Op1 != AArch64::SP && "At most one of the registers can be SP"); + // CMP TargetReg, ScratchReg + std::swap(Op1, Op2); + Cond = AArch64CC::GT; + } + BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(AArch64::SUBSXrx64), + AArch64::XZR) + .addReg(Op1) + .addReg(Op2) + .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 0)) + .setMIFlags(MachineInstr::FrameSetup); + + // B. LoopExit + BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(AArch64::Bcc)) + .addImm(Cond) + .addMBB(ExitMBB) + .setMIFlags(MachineInstr::FrameSetup); + + // STR XZR, [ScratchReg, #MaxUnprobedStack] + BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII->get(AArch64::STRXui)) + .addReg(AArch64::XZR) + .addReg(ScratchReg) + .addImm(0) + .setMIFlags(MachineInstr::FrameSetup); + + // B loop + BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII->get(AArch64::B)) + .addMBB(LoopTestMBB) + .setMIFlags(MachineInstr::FrameSetup); + + LoopTestMBB->addSuccessor(ExitMBB); + LoopTestMBB->addSuccessor(LoopBodyMBB); + LoopBodyMBB->addSuccessor(LoopTestMBB); + // Synthesize the exit MBB. + ExitMBB->splice(ExitMBB->end(), &MBB, std::next(MBBI), MBB.end()); + ExitMBB->transferSuccessorsAndUpdatePHIs(&MBB); + MBB.addSuccessor(LoopTestMBB); + + // Update liveins. + if (MF.getRegInfo().reservedRegsFrozen()) { + recomputeLiveIns(*LoopTestMBB); + recomputeLiveIns(*LoopBodyMBB); + recomputeLiveIns(*ExitMBB); + } + + // LoopExit: + // STR XZR, [TargetReg] + BuildMI(*ExitMBB, ExitMBB->begin(), DL, TII->get(AArch64::STRXui)) + .addReg(AArch64::XZR) + .addReg(TargetReg) + .addImm(0) + .setMIFlags(MachineInstr::FrameSetup); + return ExitMBB->begin(); +} + #define GET_INSTRINFO_HELPERS #define GET_INSTRMAP_INFO #include "AArch64GenInstrInfo.inc" Index: llvm/lib/Target/AArch64/AArch64InstrInfo.td =================================================================== --- llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -886,7 +886,8 @@ // Miscellaneous instructions. //===----------------------------------------------------------------------===// -let Defs = [SP], Uses = [SP], hasSideEffects = 1, isCodeGenOnly = 1 in { +let hasSideEffects = 1, isCodeGenOnly = 1 in { +let Defs = [SP], Uses = [SP] in { // We set Sched to empty list because we expect these instructions to simply get // removed in most cases. def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2), @@ -895,7 +896,23 @@ def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2), [(AArch64callseq_end timm:$amt1, timm:$amt2)]>, Sched<[]>; -} // Defs = [SP], Uses = [SP], hasSideEffects = 1, isCodeGenOnly = 1 + +// Probed stack allocation of a constant size, used in function prologues when +// stack-clash protection is enabled. +def PROBED_STACKALLOC : Pseudo<(outs GPR64:$scratch), + (ins i64imm:$stacksize, i64imm:$fixed_offset, + i64imm:$scalable_offset), + []>, + Sched<[]>; +} // Defs = [SP], Uses = [SP] + +// Probed stack allocation of a variable size, used in function prologues when +// stack-clash protection is enabled. +def PROBED_STACKALLOC_VAR : Pseudo<(outs GPR64sp:$scratch), + (ins GPR64sp:$target, i64imm:$maxsize), + []>, + Sched<[]>; +} //hasSideEffects = 1, isCodeGenOnly = 1 let isReMaterializable = 1, isCodeGenOnly = 1 in { // FIXME: The following pseudo instructions are only needed because remat Index: llvm/test/CodeGen/AArch64/framelayout-sve.mir =================================================================== --- llvm/test/CodeGen/AArch64/framelayout-sve.mir +++ llvm/test/CodeGen/AArch64/framelayout-sve.mir @@ -206,7 +206,7 @@ # CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w29, -16 # CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -2 # CHECK-NEXT: $[[TMP:x[0-9]+]] = frame-setup SUBXri $sp, 16, 0 -# CHECK-NEXT: $sp = ANDXri killed $[[TMP]] +# CHECK-NEXT: $sp = frame-setup ANDXri killed $[[TMP]] # CHECK-NEXT: $sp = frame-destroy ADDXri $fp, 0, 0 # CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 16 # CHECK-NEXT: $sp, $fp, $lr = frame-destroy LDPXpost $sp, 2 @@ -1032,7 +1032,7 @@ # CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 # CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -1 # CHECK-NEXT: $[[TMP:x[0-9]+]] = frame-setup SUBXri $sp, 16, 0 -# CHECK-NEXT: $sp = ANDXri killed $[[TMP]] +# CHECK-NEXT: $sp = frame-setup ANDXri killed $[[TMP]] # CHECK: $sp = frame-destroy ADDVL_XXI $fp, -18 # CHECK-NEXT: $p15 = frame-destroy LDR_PXI $sp, 4 Index: llvm/test/CodeGen/AArch64/spill-stack-realignment.mir =================================================================== --- llvm/test/CodeGen/AArch64/spill-stack-realignment.mir +++ llvm/test/CodeGen/AArch64/spill-stack-realignment.mir @@ -21,7 +21,7 @@ - { id: 1, size: 4, alignment: 4, local-offset: -68 } # CHECK: body: -# CHECK: $sp = ANDXri killed ${{x[0-9]+}}, 7865 +# CHECK: $sp = frame-setup ANDXri killed ${{x[0-9]+}}, 7865 # CHECK: STRSui $s0, $sp, 0 # CHECK: STRSui $s0, $fp, 7 body: | Index: llvm/test/CodeGen/AArch64/stack-probing-64k.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AArch64/stack-probing-64k.ll @@ -0,0 +1,391 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs -enable-post-misched=false | FileCheck %s +; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs -enable-post-misched=false -global-isel | FileCheck %s + +; Tests for prolog sequences for stack probing, when using a 64KiB stack guard. + +; 64k bytes is the largest frame we can probe in one go. +define void @static_65536(i8** %out) #0 { +; CHECK-LABEL: static_65536: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 +; CHECK-NEXT: .cfi_def_cfa_offset 65552 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #16, lsl #12 // =65536 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 65536, align 1 + store i8* %vla, i8** %out, align 8 + ret void +} + +; 64k+16 bytes, still needs just one probe. +define void @static_65552(ptr %out) #0 { +; CHECK-LABEL: static_65552: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 +; CHECK-NEXT: .cfi_def_cfa_offset 65552 +; CHECK-NEXT: str xzr, [sp], #-16 +; CHECK-NEXT: .cfi_def_cfa_offset 65568 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #16, lsl #12 // =65536 +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 65552, align 1 + store i8* %vla, i8** %out, align 8 + ret void +} + +; 64k+1024 bytes, the largest frame which needs just one probe. +define void @static_66560(ptr %out) #0 { +; CHECK-LABEL: static_66560: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 +; CHECK-NEXT: .cfi_def_cfa_offset 65552 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: .cfi_def_cfa_offset 66576 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #16, lsl #12 // =65536 +; CHECK-NEXT: .cfi_def_cfa_offset 1040 +; CHECK-NEXT: add sp, sp, #1024 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 66560, align 1 + store i8* %vla, i8** %out, align 8 + ret void +} + +; 64k+1024+16 bytes, the smallest frame which needs two probes. +define void @static_66576(ptr %out) #0 { +; CHECK-LABEL: static_66576: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 +; CHECK-NEXT: .cfi_def_cfa_offset 65552 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: sub sp, sp, #1040 +; CHECK-NEXT: .cfi_def_cfa_offset 66592 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #16, lsl #12 // =65536 +; CHECK-NEXT: .cfi_def_cfa_offset 1056 +; CHECK-NEXT: add sp, sp, #1040 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 66576, align 1 + store i8* %vla, i8** %out, align 8 + ret void +} + +; 2*64k+1024, the largest frame needing two probes. +define void @static_132096(i8** %out) #0 { +; CHECK-LABEL: static_132096: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 +; CHECK-NEXT: .cfi_def_cfa_offset 65552 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 +; CHECK-NEXT: .cfi_def_cfa_offset 131088 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: .cfi_def_cfa_offset 132112 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #32, lsl #12 // =131072 +; CHECK-NEXT: .cfi_def_cfa_offset 1040 +; CHECK-NEXT: add sp, sp, #1024 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 132096, align 1 + store i8* %vla, i8** %out, align 8 + ret void +} + +; 4*64k-16, the largest frame probed without a loop. +define void @static_262128(i8** %out) #0 { +; CHECK-LABEL: static_262128: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 +; CHECK-NEXT: .cfi_def_cfa_offset 65552 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 +; CHECK-NEXT: .cfi_def_cfa_offset 131088 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 +; CHECK-NEXT: .cfi_def_cfa_offset 196624 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: sub sp, sp, #15, lsl #12 // =61440 +; CHECK-NEXT: .cfi_def_cfa_offset 258064 +; CHECK-NEXT: sub sp, sp, #4080 +; CHECK-NEXT: .cfi_def_cfa_offset 262144 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #63, lsl #12 // =258048 +; CHECK-NEXT: .cfi_def_cfa_offset 4096 +; CHECK-NEXT: add sp, sp, #4080 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 262128, align 1 + store i8* %vla, i8** %out, align 8 + ret void +} + +; 4*64k, smallest frame probed with a loop. +define void @static_262144(i8** %out) #0 { +; CHECK-LABEL: static_262144: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub x9, sp, #64, lsl #12 // =262144 +; CHECK-NEXT: .cfi_def_cfa w9, 262160 +; CHECK-NEXT: .LBB6_1: // %entry +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: cmp sp, x9 +; CHECK-NEXT: b.ne .LBB6_1 +; CHECK-NEXT: // %bb.2: // %entry +; CHECK-NEXT: .cfi_def_cfa_register wsp +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #64, lsl #12 // =262144 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 262144, align 1 + store i8* %vla, i8** %out, align 8 + ret void +} + +; 4*64k+1024, large enough to use a loop, but not a multiple of 64KiB +; so has a reminder, but no extra probe. +define void @static_263168(i8** %out) #0 { +; CHECK-LABEL: static_263168: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub x9, sp, #64, lsl #12 // =262144 +; CHECK-NEXT: .cfi_def_cfa w9, 262160 +; CHECK-NEXT: .LBB7_1: // %entry +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: cmp sp, x9 +; CHECK-NEXT: b.ne .LBB7_1 +; CHECK-NEXT: // %bb.2: // %entry +; CHECK-NEXT: .cfi_def_cfa_register wsp +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: .cfi_def_cfa_offset 263184 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #64, lsl #12 // =262144 +; CHECK-NEXT: .cfi_def_cfa_offset 1040 +; CHECK-NEXT: add sp, sp, #1024 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 263168, align 1 + store i8* %vla, i8** %out, align 8 + ret void +} + +; 4*64k+1040, large enough to use a loop, has a reminder and +; an extra probe. +define void @static_263183(i8** %out) #0 { +; CHECK-LABEL: static_263183: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub x9, sp, #64, lsl #12 // =262144 +; CHECK-NEXT: .cfi_def_cfa w9, 262160 +; CHECK-NEXT: .LBB8_1: // %entry +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: cmp sp, x9 +; CHECK-NEXT: b.ne .LBB8_1 +; CHECK-NEXT: // %bb.2: // %entry +; CHECK-NEXT: .cfi_def_cfa_register wsp +; CHECK-NEXT: sub sp, sp, #1040 +; CHECK-NEXT: .cfi_def_cfa_offset 263200 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #64, lsl #12 // =262144 +; CHECK-NEXT: .cfi_def_cfa_offset 1056 +; CHECK-NEXT: add sp, sp, #1040 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 263184, align 1 + store i8* %vla, i8** %out, align 8 + ret void +} + +; A small allocation, but with a very large alignment requirement. We do this +; by moving SP far enough that a sufficiently-aligned block will exist +; somewhere in the stack frame, so must probe the whole of that larger SP move. +define void @static_16_align_131072(i8** %out) #0 { +; CHECK-LABEL: static_16_align_131072: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: .cfi_def_cfa w29, 16 +; CHECK-NEXT: .cfi_offset w30, -8 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub x9, sp, #31, lsl #12 // =126976 +; CHECK-NEXT: sub x9, x9, #4080 +; CHECK-NEXT: and x9, x9, #0xfffffffffffe0000 +; CHECK-NEXT: .LBB9_1: // %entry +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 +; CHECK-NEXT: cmp sp, x9 +; CHECK-NEXT: b.le .LBB9_3 +; CHECK-NEXT: // %bb.2: // %entry +; CHECK-NEXT: // in Loop: Header=BB9_1 Depth=1 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: b .LBB9_1 +; CHECK-NEXT: .LBB9_3: // %entry +; CHECK-NEXT: str xzr, [x9] +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: .cfi_def_cfa wsp, 16 +; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w30 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 16, align 131072 + store i8* %vla, i8** %out, align 8 + ret void +} + +; A small allocation, but with a very large alignment requirement which +; is nevertheless small enough as to not need a loop. +define void @static_16_align_8192(i8** %out) #0 { +; CHECK-LABEL: static_16_align_8192: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: .cfi_def_cfa w29, 16 +; CHECK-NEXT: .cfi_offset w30, -8 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub x9, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: sub x9, x9, #4080 +; CHECK-NEXT: and x9, x9, #0xffffffffffffe000 +; CHECK-NEXT: str xzr, [x9] +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: .cfi_def_cfa wsp, 16 +; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w30 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 16, align 8192 + store i8* %vla, i8** %out, align 8 + ret void +} + +; A large allocation with a very large alignment requirement which +; is nevertheless small enough as to not need a loop. +define void @static_32752_align_32k(i8** %out) #0 { +; CHECK-LABEL: static_32752_align_32k: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: .cfi_def_cfa w29, 16 +; CHECK-NEXT: .cfi_offset w30, -8 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub x9, sp, #7, lsl #12 // =28672 +; CHECK-NEXT: sub x9, x9, #4080 +; CHECK-NEXT: and x9, x9, #0xffffffffffff8000 +; CHECK-NEXT: str xzr, [x9] +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: .cfi_def_cfa wsp, 16 +; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w30 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 32752, align 32768 + store i8* %vla, i8** %out, align 8 + ret void +} + +attributes #0 = { uwtable(async) "probe-stack"="inline-asm" "stack-probe-size"="65536" "frame-pointer"="none" } Index: llvm/test/CodeGen/AArch64/stack-probing-sve.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AArch64/stack-probing-sve.ll @@ -0,0 +1,406 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs | FileCheck %s +; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs -global-isel -global-isel-abort=2 | FileCheck %s + +; Test prolog sequences for stack probing when SVE objects are involved. + +; An SVE stack slot needs probing, because we don't know its size at +; compile-time. +define void @sve_1_vector(** %out) #0 { +; CHECK-LABEL: sve_1_vector: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG +; CHECK-NEXT: .LBB0_1: // %entry +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub x9, x9, #1, lsl #12 // =4096 +; CHECK-NEXT: cmp sp, x9 +; CHECK-NEXT: b.gt .LBB0_3 +; CHECK-NEXT: // %bb.2: // %entry +; CHECK-NEXT: // in Loop: Header=BB0_1 Depth=1 +; CHECK-NEXT: str xzr, [x9] +; CHECK-NEXT: b .LBB0_1 +; CHECK-NEXT: .LBB0_3: // %entry +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: .cfi_def_cfa wsp, 16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %vec = alloca , align 16 + ret void +} + +; As above, but with 4 SVE vectors of stack space. +define void @sve_4_vector(** %out) #0 { +; CHECK-LABEL: sve_4_vector: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: addvl sp, sp, #-4 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG +; CHECK-NEXT: .LBB1_1: // %entry +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub x9, x9, #1, lsl #12 // =4096 +; CHECK-NEXT: cmp sp, x9 +; CHECK-NEXT: b.gt .LBB1_3 +; CHECK-NEXT: // %bb.2: // %entry +; CHECK-NEXT: // in Loop: Header=BB1_1 Depth=1 +; CHECK-NEXT: str xzr, [x9] +; CHECK-NEXT: b .LBB1_1 +; CHECK-NEXT: .LBB1_3: // %entry +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: addvl sp, sp, #4 +; CHECK-NEXT: .cfi_def_cfa wsp, 16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %vec1 = alloca , align 16 + %vec2 = alloca , align 16 + %vec3 = alloca , align 16 + %vec4 = alloca , align 16 + ret void +} + +; The area allocated to save callee-saved SVE registers does not need to be +; probed, because it will always be written to, which acts as a probe. +define void @sve_1v_csr( %a) #0 { +; CHECK-LABEL: sve_1v_csr: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG +; CHECK-NEXT: str z8, [sp] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG +; CHECK-NEXT: //APP +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: ldr z8, [sp] // 16-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: .cfi_def_cfa wsp, 16 +; CHECK-NEXT: .cfi_restore z8 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + call void asm sideeffect "", "~{z8}" () + ret void +} + +define void @sve_4v_csr( %a) #0 { +; CHECK-LABEL: sve_4v_csr: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: addvl sp, sp, #-4 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG +; CHECK-NEXT: str z8, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z9, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z10, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z11, [sp] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 32 * VG +; CHECK-NEXT: //APP +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: ldr z11, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ldr z10, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z9, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z8, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #4 +; CHECK-NEXT: .cfi_def_cfa wsp, 16 +; CHECK-NEXT: .cfi_restore z8 +; CHECK-NEXT: .cfi_restore z9 +; CHECK-NEXT: .cfi_restore z10 +; CHECK-NEXT: .cfi_restore z11 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + call void asm sideeffect "", "~{z8},~{z9},~{z10},~{z11}" () + ret void +} + +define void @sve_1p_csr( %a) #0 { +; CHECK-LABEL: sve_1p_csr: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: //APP +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: .cfi_def_cfa wsp, 16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + call void asm sideeffect "", "~{p8}" () + ret void +} + +define void @sve_4p_csr( %a) #0 { +; CHECK-LABEL: sve_4p_csr: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p9, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p10, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p11, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: //APP +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: ldr p11, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p10, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p9, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: .cfi_def_cfa wsp, 16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + call void asm sideeffect "", "~{p8},~{p9},~{p10},~{p11}" () + ret void +} + +; 1 SVE vector, which needs probing, and a 16-byte fixed size object, which +; doesn't. Here the final store of the SVE probing loop gets merged with the +; fixed-size SP decrement, but this doesn't affect probing as the pattern of +; memory access is the same. +define void @sve_1_vector_16_arr(** %out) #0 { +; CHECK-LABEL: sve_1_vector_16_arr: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG +; CHECK-NEXT: .LBB6_1: // %entry +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub x9, x9, #1, lsl #12 // =4096 +; CHECK-NEXT: cmp sp, x9 +; CHECK-NEXT: b.gt .LBB6_3 +; CHECK-NEXT: // %bb.2: // %entry +; CHECK-NEXT: // in Loop: Header=BB6_1 Depth=1 +; CHECK-NEXT: str xzr, [x9] +; CHECK-NEXT: b .LBB6_1 +; CHECK-NEXT: .LBB6_3: // %entry +; CHECK-NEXT: str xzr, [sp], #-16 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 32 + 8 * VG +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: .cfi_def_cfa wsp, 32 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %vec = alloca , align 16 + %arr = alloca i8, i64 16, align 1 + ret void +} + +; 1 SVE stack slot and a 4096-byte stack slot, both of which need probing. +; TODO: This could be optimised by combining the fixed-size offset into the +; loop. +define void @sve_1_vector_4096_arr(** %out) #0 { +; CHECK-LABEL: sve_1_vector_4096_arr: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG +; CHECK-NEXT: .LBB7_1: // %entry +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub x9, x9, #1, lsl #12 // =4096 +; CHECK-NEXT: cmp sp, x9 +; CHECK-NEXT: b.gt .LBB7_3 +; CHECK-NEXT: // %bb.2: // %entry +; CHECK-NEXT: // in Loop: Header=BB7_1 Depth=1 +; CHECK-NEXT: str xzr, [x9] +; CHECK-NEXT: b .LBB7_1 +; CHECK-NEXT: .LBB7_3: // %entry +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x90, 0x20, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 4112 + 8 * VG +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: .cfi_def_cfa wsp, 4112 +; CHECK-NEXT: add sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %vec = alloca , align 16 + %arr = alloca i8, i64 4096, align 1 + ret void +} + +; 1 SVE stack slot and a large stack slot, both of which need probing. +; TODO this could be optimised by combining both loops. +define void @sve_1_vector_12288_arr(** %out) #0 { +; CHECK-LABEL: sve_1_vector_12288_arr: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG +; CHECK-NEXT: .LBB8_1: // %entry +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub x9, x9, #1, lsl #12 // =4096 +; CHECK-NEXT: cmp sp, x9 +; CHECK-NEXT: b.gt .LBB8_3 +; CHECK-NEXT: // %bb.2: // %entry +; CHECK-NEXT: // in Loop: Header=BB8_1 Depth=1 +; CHECK-NEXT: str xzr, [x9] +; CHECK-NEXT: b .LBB8_1 +; CHECK-NEXT: .LBB8_3: // %entry +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x90, 0x20, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 4112 + 8 * VG +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0x90, 0xc0, 0x00, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 8208 + 8 * VG +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0x90, 0xe0, 0x00, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 12304 + 8 * VG +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: .cfi_def_cfa wsp, 12304 +; CHECK-NEXT: add sp, sp, #3, lsl #12 // =12288 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %vec = alloca , align 16 + %arr = alloca i8, i64 12288, align 1 + ret void +} + +; Not tested: SVE stack objects with alignment >16 bytes, which isn't currently +; supported even without stack-probing. + +; 1 SVE vector, which needs probing, and a 16-byte fixed size object, which +; has a large alignment requirement so also needs a probing loop. +define void @sve_1_vector_16_arr_align_8192(** %out) #0 { +; CHECK-LABEL: sve_1_vector_16_arr_align_8192: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: .cfi_def_cfa w29, 16 +; CHECK-NEXT: .cfi_offset w30, -8 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: .LBB9_1: // %entry +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub x9, x9, #1, lsl #12 // =4096 +; CHECK-NEXT: cmp sp, x9 +; CHECK-NEXT: b.gt .LBB9_3 +; CHECK-NEXT: // %bb.2: // %entry +; CHECK-NEXT: // in Loop: Header=BB9_1 Depth=1 +; CHECK-NEXT: str xzr, [x9] +; CHECK-NEXT: b .LBB9_1 +; CHECK-NEXT: .LBB9_3: // %entry +; CHECK-NEXT: sub x9, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: sub x9, x9, #4080 +; CHECK-NEXT: and x9, x9, #0xffffffffffffe000 +; CHECK-NEXT: .LBB9_4: // %entry +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: cmp sp, x9 +; CHECK-NEXT: b.le .LBB9_6 +; CHECK-NEXT: // %bb.5: // %entry +; CHECK-NEXT: // in Loop: Header=BB9_4 Depth=1 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: b .LBB9_4 +; CHECK-NEXT: .LBB9_6: // %entry +; CHECK-NEXT: str xzr, [x9] +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: .cfi_def_cfa wsp, 16 +; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w30 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %vec = alloca , align 16 + %arr = alloca i8, i64 16, align 8192 + ret void +} + +; For 64k guard pages, the only difference is the constant subtracted from SP +; in the loop. +define void @sve_64k_guard(** %out) #0 "stack-probe-size"="65536" { +; CHECK-LABEL: sve_64k_guard: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG +; CHECK-NEXT: .LBB10_1: // %entry +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub x9, x9, #16, lsl #12 // =65536 +; CHECK-NEXT: cmp sp, x9 +; CHECK-NEXT: b.gt .LBB10_3 +; CHECK-NEXT: // %bb.2: // %entry +; CHECK-NEXT: // in Loop: Header=BB10_1 Depth=1 +; CHECK-NEXT: str xzr, [x9] +; CHECK-NEXT: b .LBB10_1 +; CHECK-NEXT: .LBB10_3: // %entry +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: .cfi_def_cfa wsp, 16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %vec = alloca , align 16 + ret void +} + +; Not tested: dynamic allocations of SVE vectors, which don't currently work +; without stack probing. + +attributes #0 = { uwtable(async) "probe-stack"="inline-asm" "frame-pointer"="none" "target-features"="+sve" } Index: llvm/test/CodeGen/AArch64/stack-probing.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AArch64/stack-probing.ll @@ -0,0 +1,474 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs -enable-post-misched=false | FileCheck %s +; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs -enable-post-misched=false -global-isel | FileCheck %s + +; Tests for prolog sequences for stack probing, when using a 4KiB stack guard. + +; Small stack frame, no probing required. +define void @static_64(i8** %out) #0 { +; CHECK-LABEL: static_64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub sp, sp, #64 +; CHECK-NEXT: .cfi_def_cfa_offset 64 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #64 +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 64, align 1 + store i8* %vla, i8** %out, align 8 + ret void +} + +; At 256 bytes we start to always create a frame pointer. No frame smaller then +; this needs a probe, so we can use the saving of at least one CSR as a probe +; at the top of our frame. +define void @static_256(i8** %out) #0 { +; CHECK-LABEL: static_256: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub sp, sp, #272 +; CHECK-NEXT: .cfi_def_cfa_offset 272 +; CHECK-NEXT: str x29, [sp, #256] // 8-byte Folded Spill +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #272 +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 256, align 1 + store i8* %vla, i8** %out, align 8 + ret void +} + +; At 1024 bytes, this is the largest frame which doesn't need probing. +define void @static_1024(i8** %out) #0 { +; CHECK-LABEL: static_1024: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: .cfi_def_cfa_offset 1040 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #1024 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 1024, align 1 + store i8* %vla, i8** %out, align 8 + ret void +} + +; At 1024+16 bytes, this is the smallest frame which needs probing. +define void @static_1040(i8** %out) #0 { +; CHECK-LABEL: static_1040: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub sp, sp, #1040 +; CHECK-NEXT: .cfi_def_cfa_offset 1056 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #1040 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 1040, align 1 + store i8* %vla, i8** %out, align 8 + ret void +} + +; 4k bytes is the largest frame we can probe in one go. +define void @static_4096(i8** %out) #0 { +; CHECK-LABEL: static_4096: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: .cfi_def_cfa_offset 4112 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 4096, align 1 + store i8* %vla, i8** %out, align 8 + ret void +} + +; 4k+16 bytes, still needs just one probe. +define void @static_4112(i8** %out) #0 { +; CHECK-LABEL: static_4112: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: .cfi_def_cfa_offset 4112 +; CHECK-NEXT: str xzr, [sp], #-16 +; CHECK-NEXT: .cfi_def_cfa_offset 4128 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 4112, align 1 + store i8* %vla, i8** %out, align 8 + ret void +} + +; 4k+1024 bytes, the largest frame which needs just one probe. +define void @static_5120(i8** %out) #0 { +; CHECK-LABEL: static_5120: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: .cfi_def_cfa_offset 4112 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: .cfi_def_cfa_offset 5136 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: .cfi_def_cfa_offset 1040 +; CHECK-NEXT: add sp, sp, #1024 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 5120, align 1 + store i8* %vla, i8** %out, align 8 + ret void +} + +; 4k+1024+16, the smallest frame which needs two probes. +define void @static_5136(i8** %out) #0 { +; CHECK-LABEL: static_5136: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: .cfi_def_cfa_offset 4112 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: sub sp, sp, #1040 +; CHECK-NEXT: .cfi_def_cfa_offset 5152 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: .cfi_def_cfa_offset 1056 +; CHECK-NEXT: add sp, sp, #1040 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 5136, align 1 + store i8* %vla, i8** %out, align 8 + ret void +} + +; 2*4k+1024, the largest frame needing two probes +define void @static_9216(i8** %out) #0 { +; CHECK-LABEL: static_9216: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: .cfi_def_cfa_offset 4112 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: .cfi_def_cfa_offset 8208 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: .cfi_def_cfa_offset 9232 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #2, lsl #12 // =8192 +; CHECK-NEXT: .cfi_def_cfa_offset 1040 +; CHECK-NEXT: add sp, sp, #1024 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 9216, align 1 + store i8* %vla, i8** %out, align 8 + ret void +} + +; 4*4k-16, the largest frame probed without a loop +define void @static_16368(i8** %out) #0 { +; CHECK-LABEL: static_16368: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: .cfi_def_cfa_offset 4112 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: .cfi_def_cfa_offset 8208 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: .cfi_def_cfa_offset 12304 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: sub sp, sp, #4080 +; CHECK-NEXT: .cfi_def_cfa_offset 16384 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #3, lsl #12 // =12288 +; CHECK-NEXT: .cfi_def_cfa_offset 4096 +; CHECK-NEXT: add sp, sp, #4080 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 16368, align 1 + store i8* %vla, i8** %out, align 8 + ret void +} + +; 4*4k, the smallest frame probed with a loop +define void @static_16384(i8** %out) #0 { +; CHECK-LABEL: static_16384: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub x9, sp, #4, lsl #12 // =16384 +; CHECK-NEXT: .cfi_def_cfa w9, 16400 +; CHECK-NEXT: .LBB10_1: // %entry +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: cmp sp, x9 +; CHECK-NEXT: b.ne .LBB10_1 +; CHECK-NEXT: // %bb.2: // %entry +; CHECK-NEXT: .cfi_def_cfa_register wsp +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #4, lsl #12 // =16384 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 16384, align 1 + store i8* %vla, i8** %out, align 8 + ret void +} + +; 4*4k + 1024, large enough to use a loop, but not a multiple of 4KiB +; so has a reminder, but no extra probe. +define void @static_17408(i8** %out) #0 { +; CHECK-LABEL: static_17408: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub x9, sp, #4, lsl #12 // =16384 +; CHECK-NEXT: .cfi_def_cfa w9, 16400 +; CHECK-NEXT: .LBB11_1: // %entry +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: cmp sp, x9 +; CHECK-NEXT: b.ne .LBB11_1 +; CHECK-NEXT: // %bb.2: // %entry +; CHECK-NEXT: .cfi_def_cfa_register wsp +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: .cfi_def_cfa_offset 17424 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #4, lsl #12 // =16384 +; CHECK-NEXT: .cfi_def_cfa_offset 1040 +; CHECK-NEXT: add sp, sp, #1024 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 17408, align 1 + store i8* %vla, i8** %out, align 8 + ret void +} + +; 4*4k+1040, large enough to use a loop, has a reminder and +; an extra probe. +define void @static_17424(i8** %out) #0 { +; CHECK-LABEL: static_17424: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub x9, sp, #4, lsl #12 // =16384 +; CHECK-NEXT: .cfi_def_cfa w9, 16400 +; CHECK-NEXT: .LBB12_1: // %entry +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: cmp sp, x9 +; CHECK-NEXT: b.ne .LBB12_1 +; CHECK-NEXT: // %bb.2: // %entry +; CHECK-NEXT: .cfi_def_cfa_register wsp +; CHECK-NEXT: sub sp, sp, #1040 +; CHECK-NEXT: .cfi_def_cfa_offset 17440 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #4, lsl #12 // =16384 +; CHECK-NEXT: .cfi_def_cfa_offset 1056 +; CHECK-NEXT: add sp, sp, #1040 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 17424, align 1 + store i8* %vla, i8** %out, align 8 + ret void +} + +; A small allocation, but with a very large alignment requirement. We do this +; by moving SP far enough that a sufficiently-aligned block will exist +; somewhere in the stack frame, so must probe the whole of that larger SP move. +define void @static_16_align_8192(i8** %out) #0 { +; CHECK-LABEL: static_16_align_8192: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: .cfi_def_cfa w29, 16 +; CHECK-NEXT: .cfi_offset w30, -8 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub x9, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: sub x9, x9, #4080 +; CHECK-NEXT: and x9, x9, #0xffffffffffffe000 +; CHECK-NEXT: .LBB13_1: // %entry +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: cmp sp, x9 +; CHECK-NEXT: b.le .LBB13_3 +; CHECK-NEXT: // %bb.2: // %entry +; CHECK-NEXT: // in Loop: Header=BB13_1 Depth=1 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: b .LBB13_1 +; CHECK-NEXT: .LBB13_3: // %entry +; CHECK-NEXT: str xzr, [x9] +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: .cfi_def_cfa wsp, 16 +; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w30 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 16, align 8192 + store i8* %vla, i8** %out, align 8 + ret void +} + +; A small allocation with a very large alignment requirement, but +; nevertheless small enough as to not need a loop. +define void @static_16_align_2048(i8** %out) #0 { +; CHECK-LABEL: static_16_align_2048: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: .cfi_def_cfa w29, 16 +; CHECK-NEXT: .cfi_offset w30, -8 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub x9, sp, #2032 +; CHECK-NEXT: and x9, x9, #0xfffffffffffff800 +; CHECK-NEXT: str xzr, [x9] +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: .cfi_def_cfa wsp, 16 +; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w30 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 16, align 2048 + store i8* %vla, i8** %out, align 8 + ret void +} + +; A large(-ish) allocation with a very large alignment requirement, but +; nevertheless small enough as to not need a loop. +define void @static_2032_align_2048(i8** %out) #0 { +; CHECK-LABEL: static_2032_align_2048: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: .cfi_def_cfa w29, 16 +; CHECK-NEXT: .cfi_offset w30, -8 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub x9, sp, #2032 +; CHECK-NEXT: and x9, x9, #0xfffffffffffff800 +; CHECK-NEXT: str xzr, [x9] +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: .cfi_def_cfa wsp, 16 +; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w30 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 2032, align 2048 + store i8* %vla, i8** %out, align 8 + ret void +} + +attributes #0 = {uwtable(async) "probe-stack"="inline-asm" "frame-pointer"="none"} +