diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.h b/llvm/lib/Target/AArch64/AArch64FrameLowering.h --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.h +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.h @@ -148,6 +148,18 @@ const StackOffset &OffsetFromDefCFA) const; bool shouldCombineCSRLocalStackBumpInEpilogue(MachineBasicBlock &MBB, unsigned StackBumpBytes) const; + + /// Replace a StackProbe stub (if any) with the actual probe code inline + virtual void inlineStackProbe(MachineFunction &MF, + MachineBasicBlock &PrologueMBB) const override; + MachineBasicBlock::iterator + inlineStackProbeFixed(MachineBasicBlock::iterator MBBI) const; + MachineBasicBlock::iterator + inlineStackProbeVar(MachineBasicBlock::iterator MBBI) const; + MachineBasicBlock::iterator + inlineStackProbeLoopExactMultiple(MachineBasicBlock::iterator MBBI, + int64_t NegProbeSize, + Register TargetReg) const; }; } // End llvm namespace diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -1091,6 +1091,7 @@ const Function &F = MF.getFunction(); const AArch64Subtarget &Subtarget = MF.getSubtarget(); const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); + const AArch64TargetLowering &TLI = *Subtarget.getTargetLowering(); const TargetInstrInfo *TII = Subtarget.getInstrInfo(); MachineModuleInfo &MMI = MF.getMMI(); AArch64FunctionInfo *AFI = MF.getInfo(); @@ -1347,12 +1348,13 @@ NumBytes = 0; } - StackOffset AllocateBefore = SVEStackSize, AllocateAfter = {}; + StackOffset AllocateBefore = {}, AllocateAfter = SVEStackSize; MachineBasicBlock::iterator CalleeSavesBegin = MBBI, CalleeSavesEnd = MBBI; // Process the SVE callee-saves to determine what space needs to be // allocated. if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) { + LLVM_DEBUG(dbgs() << "SVECalleeSavedStackSize = " << CalleeSavedSize << "\n"); // Find callee save instructions in frame. CalleeSavesBegin = MBBI; assert(IsSVECalleeSave(CalleeSavesBegin) && "Unexpected instruction"); @@ -1363,22 +1365,42 @@ AllocateBefore = StackOffset::getScalable(CalleeSavedSize); AllocateAfter = SVEStackSize - AllocateBefore; } + LLVM_DEBUG(dbgs() << "AllocateBefore = " << AllocateBefore.getFixed() << ", " + << AllocateBefore.getScalable() << "\n"); + LLVM_DEBUG(dbgs() << "AllocateAfter = " << AllocateAfter.getFixed() << ", " + << AllocateAfter.getScalable() << "\n"); - // Allocate space for the callee saves (if any). + // Allocate space for the SVE callee saves (if any). + // This space doesn't need stack probing, because it will all be written to + // when saving the CSRs. emitFrameOffset(MBB, CalleeSavesBegin, DL, AArch64::SP, AArch64::SP, -AllocateBefore, TII, MachineInstr::FrameSetup); // Finally allocate remaining SVE stack space. 
- emitFrameOffset(MBB, CalleeSavesEnd, DL, AArch64::SP, AArch64::SP, - -AllocateAfter, TII, - MachineInstr::FrameSetup); + if (TLI.hasInlineStackProbe(MF) && AllocateAfter) { + Register ScratchReg = findScratchNonCalleeSaveRegister(&MBB); + assert(ScratchReg != AArch64::NoRegister); + emitFrameOffset(MBB, CalleeSavesEnd, DL, ScratchReg, AArch64::SP, + -AllocateAfter, TII, MachineInstr::FrameSetup); + BuildMI(MBB, MBBI, DL, TII->get(AArch64::PROBED_STACKALLOC_VAR)) + .addUse(ScratchReg); + } else { + emitFrameOffset(MBB, CalleeSavesEnd, DL, AArch64::SP, AArch64::SP, + -AllocateAfter, TII, MachineInstr::FrameSetup); + } // Allocate space for the rest of the frame. if (NumBytes) { // Alignment is required for the parent frame, not the funclet const bool NeedsRealignment = !IsFunclet && RegInfo->needsStackRealignment(MF); + bool NeedsStackProbe = TLI.hasInlineStackProbe(MF) && + (NumBytes >= TLI.getStackProbeMaxUnprobedStack(MF) || + MFI.hasVarSizedObjects()); + if (NeedsRealignment) + NeedsStackProbe |= TLI.hasInlineStackProbe(MF) && + (NumBytes + MFI.getMaxAlign().value()) >= TLI.getStackProbeMaxUnprobedStack(MF); unsigned scratchSPReg = AArch64::SP; if (NeedsRealignment) { @@ -1387,13 +1409,26 @@ } // If we're a leaf function, try using the red zone. - if (!canUseRedZone(MF)) + if (!canUseRedZone(MF)) { // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have // the correct value here, as NumBytes also includes padding bytes, // which shouldn't be counted here. - emitFrameOffset(MBB, MBBI, DL, scratchSPReg, AArch64::SP, - StackOffset::getFixed(-NumBytes), TII, - MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI); + if (NeedsStackProbe && !NeedsRealignment) { + // If we don't need to re-align the stack, we can use a more efficient + // sequence for stack probing. + Register ScratchReg = findScratchNonCalleeSaveRegister(&MBB); + assert(ScratchReg != AArch64::NoRegister); + BuildMI(MBB, MBBI, DL, + TII->get(AArch64::PROBED_STACKALLOC)) + .addDef(ScratchReg) + .addImm(-NumBytes); + } else { + emitFrameOffset(MBB, MBBI, DL, scratchSPReg, AArch64::SP, + StackOffset::getFixed(-NumBytes), TII, + MachineInstr::FrameSetup, false, NeedsWinCFI, + &HasWinCFI); + } + } if (NeedsRealignment) { const unsigned NrBitsToZero = Log2(MFI.getMaxAlign()); @@ -1411,9 +1446,17 @@ | ((64 - NrBitsToZero) << 6) // immr | ((64 - NrBitsToZero - 1) << 0); // imms - BuildMI(MBB, MBBI, DL, TII->get(AArch64::ANDXri), AArch64::SP) - .addReg(scratchSPReg, RegState::Kill) - .addImm(andMaskEncoded); + if (NeedsStackProbe) { + BuildMI(MBB, MBBI, DL, TII->get(AArch64::ANDXri), scratchSPReg) + .addReg(scratchSPReg, RegState::Kill) + .addImm(andMaskEncoded); + BuildMI(MBB, MBBI, DL, TII->get(AArch64::PROBED_STACKALLOC_VAR)) + .addUse(scratchSPReg); + } else { + BuildMI(MBB, MBBI, DL, TII->get(AArch64::ANDXri), AArch64::SP) + .addReg(scratchSPReg, RegState::Kill) + .addImm(andMaskEncoded); + } AFI->setStackRealigned(true); if (NeedsWinCFI) { HasWinCFI = true; @@ -3580,3 +3623,156 @@ dbgs() << "\n"; }); } + +/// Emit a loop to decrement SP until it is equal to TargetReg, with probes at +/// least every NegProbeSize bytes. Returns an iterator of the first instruction +/// after the loop. The difference between SP and TargetReg must be an exact +/// multiple of NegProbeSize. 
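+/// The emitted control flow is roughly (before any later scheduling), with
+/// ProbeSize standing for -NegProbeSize:
+///   LoopMBB:
+///     SUB  SP, SP, #ProbeSize
+///     STR  XZR, [SP]
+///     CMP  SP, TargetReg
+///     B.NE LoopMBB
+///   ExitMBB: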
+MachineBasicBlock::iterator
+AArch64FrameLowering::inlineStackProbeLoopExactMultiple(
+    MachineBasicBlock::iterator MBBI, int64_t NegProbeSize,
+    Register TargetReg) const {
+  MachineBasicBlock &MBB = *MBBI->getParent();
+  MachineFunction &MF = *MBB.getParent();
+  const AArch64InstrInfo *TII =
+      MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
+  DebugLoc DL = MBB.findDebugLoc(MBBI);
+
+  MachineFunction::iterator MBBInsertPoint = std::next(MBB.getIterator());
+  MachineBasicBlock *LoopMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock());
+  MF.insert(MBBInsertPoint, LoopMBB);
+  MachineBasicBlock *ExitMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock());
+  MF.insert(MBBInsertPoint, ExitMBB);
+
+  // ADD SP, SP, #NegProbeSize (or equivalent if NegProbeSize is not encodable
+  // in ADD).
+  emitFrameOffset(*LoopMBB, LoopMBB->end(), DL, AArch64::SP, AArch64::SP,
+                  StackOffset::getFixed(NegProbeSize), TII,
+                  MachineInstr::FrameSetup);
+  // STR XZR, [SP]
+  BuildMI(*LoopMBB, LoopMBB->end(), DL, TII->get(AArch64::STRXui))
+      .addReg(AArch64::XZR)
+      .addReg(AArch64::SP)
+      .addImm(0)
+      .setMIFlags(MachineInstr::FrameSetup);
+  // CMP SP, TargetReg
+  BuildMI(*LoopMBB, LoopMBB->end(), DL, TII->get(AArch64::SUBSXrx64),
+          AArch64::XZR)
+      .addReg(AArch64::SP)
+      .addReg(TargetReg)
+      .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 0))
+      .setMIFlags(MachineInstr::FrameSetup);
+  // B.NE Loop
+  BuildMI(*LoopMBB, LoopMBB->end(), DL, TII->get(AArch64::Bcc))
+      .addImm(AArch64CC::NE)
+      .addMBB(LoopMBB)
+      .setMIFlags(MachineInstr::FrameSetup);
+
+  LoopMBB->addSuccessor(ExitMBB);
+  LoopMBB->addSuccessor(LoopMBB);
+  // Synthesize the exit MBB.
+  ExitMBB->splice(ExitMBB->end(), &MBB, std::next(MBBI), MBB.end());
+  ExitMBB->transferSuccessorsAndUpdatePHIs(&MBB);
+  MBB.addSuccessor(LoopMBB);
+  // Update liveins.
+  recomputeLiveIns(*LoopMBB);
+  recomputeLiveIns(*ExitMBB);
+
+  return ExitMBB->begin();
+}
+
+MachineBasicBlock::iterator AArch64FrameLowering::inlineStackProbeFixed(
+    MachineBasicBlock::iterator MBBI) const {
+  MachineBasicBlock &MBB = *MBBI->getParent();
+  MachineFunction &MF = *MBB.getParent();
+  const AArch64TargetLowering *TLI =
+      MF.getSubtarget<AArch64Subtarget>().getTargetLowering();
+  const AArch64InstrInfo *TII =
+      MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
+
+  DebugLoc DL = MBB.findDebugLoc(MBBI);
+  Register ScratchReg = MBBI->getOperand(0).getReg();
+  int64_t NegFrameSize = MBBI->getOperand(1).getImm();
+  int64_t NegProbeSize = -(int64_t)TLI->getStackProbeSize(MF);
+  int64_t NumBlocks = NegFrameSize / NegProbeSize;
+  int64_t NegResidualSize = NegFrameSize % NegProbeSize;
+  MachineBasicBlock::iterator NextInst;
+  LLVM_DEBUG(dbgs() << "Stack probing: total " << NegFrameSize << " bytes, "
+                    << NumBlocks << " blocks of " << NegProbeSize
+                    << " bytes, plus " << NegResidualSize << " bytes\n");
+
+  if (NegResidualSize != 0) {
+    // ADD SP, SP, #NegResidualSize (or equivalent if NegResidualSize is not
+    // encodable in ADD).
+    emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
+                    StackOffset::getFixed(NegResidualSize), TII,
+                    MachineInstr::FrameSetup);
+    // STR XZR, [SP]
+    BuildMI(MBB, MBBI, DL, TII->get(AArch64::STRXui))
+        .addReg(AArch64::XZR)
+        .addReg(AArch64::SP)
+        .addImm(0)
+        .setMIFlags(MachineInstr::FrameSetup);
+  }
+
+  if (NumBlocks < 3) {
+    for (int i = 0; i < NumBlocks; ++i) {
+      // ADD SP, SP, #NegProbeSize (or equivalent if NegProbeSize is not
+      // encodable in ADD).
+      emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
+                      StackOffset::getFixed(NegProbeSize), TII,
+                      MachineInstr::FrameSetup);
+      // STR XZR, [SP]
+      BuildMI(MBB, MBBI, DL, TII->get(AArch64::STRXui))
+          .addReg(AArch64::XZR)
+          .addReg(AArch64::SP)
+          .addImm(0)
+          .setMIFlags(MachineInstr::FrameSetup);
+    }
+    NextInst = std::next(MBBI);
+  } else if (NumBlocks != 0) {
+    // ADD ScratchReg, SP, #NegProbeSize * NumBlocks (or equivalent if that is
+    // not encodable in ADD).
+    emitFrameOffset(MBB, MBBI, DL, ScratchReg, AArch64::SP,
+                    StackOffset::getFixed(NegProbeSize * NumBlocks), TII,
+                    MachineInstr::FrameSetup);
+    NextInst =
+        inlineStackProbeLoopExactMultiple(MBBI, NegProbeSize, ScratchReg);
+  }
+
+  MBBI->eraseFromParent();
+  return NextInst;
+}
+
+MachineBasicBlock::iterator AArch64FrameLowering::inlineStackProbeVar(
+    MachineBasicBlock::iterator MBBI) const {
+  MachineBasicBlock &MBB = *MBBI->getParent();
+  MachineFunction &MF = *MBB.getParent();
+  const AArch64TargetLowering *TLI =
+      MF.getSubtarget<AArch64Subtarget>().getTargetLowering();
+  const AArch64InstrInfo *TII =
+      MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
+
+  DebugLoc DL = MBB.findDebugLoc(MBBI);
+  Register TargetReg = MBBI->getOperand(0).getReg();
+  int64_t NegProbeSize = -(int64_t)TLI->getStackProbeSize(MF);
+  MachineBasicBlock::iterator NextInst = std::next(MBBI);
+
+  NextInst = TII->insertStackProbingLoop(MBBI, NegProbeSize, TargetReg);
+
+  MBBI->eraseFromParent();
+  return NextInst;
+}
+
+void AArch64FrameLowering::inlineStackProbe(
+    MachineFunction &MF, MachineBasicBlock &MBB) const {
+  for (auto MBBI = MBB.begin(), E = MBB.end(); MBBI != E;) {
+    if (MBBI->getOpcode() == AArch64::PROBED_STACKALLOC) {
+      MBBI = inlineStackProbeFixed(MBBI);
+      E = MBBI->getParent()->end();
+    } else if (MBBI->getOpcode() == AArch64::PROBED_STACKALLOC_VAR) {
+      MBBI = inlineStackProbeVar(MBBI);
+      E = MBBI->getParent()->end();
+    } else {
+      ++MBBI;
+    }
+  }
+}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -815,6 +815,17 @@
     return 128;
   }
 
+  /// True if stack clash protection is enabled for this function.
+  bool hasInlineStackProbe(MachineFunction &MF) const override;
+
+  /// Get the interval between stack-clash probes, which is equal to the stack
+  /// guard size, in bytes.
+  unsigned getStackProbeSize(MachineFunction &MF) const;
+
+  /// Get the maximum allowed number of unprobed bytes above SP at an ABI
+  /// boundary.
+  unsigned getStackProbeMaxUnprobedStack(MachineFunction &MF) const;
+
 private:
   /// Keep a pointer to the AArch64Subtarget around so that we can
   /// make the right decision when generating code for different targets.
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -16623,6 +16623,45 @@
   return TargetLowering::getSSPStackGuardCheck(M);
 }
 
+bool AArch64TargetLowering::hasInlineStackProbe(MachineFunction &MF) const {
+  // If the function specifically requests inline stack probes, emit them.
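+  // Only the "inline-asm" value is handled here (this is what e.g. Clang
+  // emits for -fstack-clash-protection on targets that implement it); any
+  // other value, such as the name of a probing function, hits the
+  // llvm_unreachable below.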
+  if (MF.getFunction().hasFnAttribute("probe-stack")) {
+    if (MF.getFunction().getFnAttribute("probe-stack").getValueAsString() ==
+        "inline-asm")
+      return true;
+    else
+      llvm_unreachable("Unsupported stack probing method");
+  }
+
+  return false;
+}
+
+unsigned AArch64TargetLowering::getStackProbeSize(MachineFunction &MF) const {
+  const TargetFrameLowering *TFI = Subtarget->getFrameLowering();
+  unsigned StackAlign = TFI->getStackAlignment();
+  assert(StackAlign >= 1 && isPowerOf2_32(StackAlign) &&
+         "Unexpected stack alignment");
+  // The default stack probe size is 4096 if the function has no
+  // stack-probe-size attribute. This is a safe default because it is the
+  // smallest possible guard page size.
+  unsigned StackProbeSize = 4096;
+  const Function &Fn = MF.getFunction();
+  if (Fn.hasFnAttribute("stack-probe-size"))
+    Fn.getFnAttribute("stack-probe-size")
+        .getValueAsString()
+        .getAsInteger(0, StackProbeSize);
+  // Round down to the stack alignment.
+  StackProbeSize &= ~(StackAlign - 1);
+  return StackProbeSize ? StackProbeSize : StackAlign;
+}
+
+unsigned AArch64TargetLowering::getStackProbeMaxUnprobedStack(
+    MachineFunction &MF) const {
+  // We assume and guarantee that, at an ABI boundary, the last probe was no
+  // more than 1024 bytes above SP.
+  return 1024;
+}
+
 Value *AArch64TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
   // Android provides a fixed TLS slot for the SafeStack pointer. See the
   // definition of TLS_SLOT_SAFESTACK in
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -311,6 +311,14 @@
   static void decomposeStackOffsetForDwarfOffsets(const StackOffset &Offset,
                                                   int64_t &ByteSized,
                                                   int64_t &VGSized);
+
+  /// Insert code to set SP to the value in TargetReg, ensuring that memory is
+  /// written to every NegProbeSize bytes. TargetReg must be below SP, and has
+  /// no alignment requirements other than the usual 16-byte alignment for SP.
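+  /// Returns an iterator to the first instruction after the inserted loop.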
+ MachineBasicBlock::iterator + insertStackProbingLoop(MachineBasicBlock::iterator MBBI, int64_t NegProbeSize, + Register TargetReg) const; + #define GET_INSTRINFO_HELPER_DECLS #include "AArch64GenInstrInfo.inc" diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -7215,6 +7215,90 @@ return AArch64::BLR; } +MachineBasicBlock::iterator +AArch64InstrInfo::insertStackProbingLoop(MachineBasicBlock::iterator MBBI, + int64_t NegProbeSize, + Register TargetReg) const { + MachineBasicBlock &MBB = *MBBI->getParent(); + MachineFunction &MF = *MBB.getParent(); + const AArch64InstrInfo *TII = + MF.getSubtarget().getInstrInfo(); + DebugLoc DL = MBB.findDebugLoc(MBBI); + + MachineFunction::iterator MBBInsertPoint = std::next(MBB.getIterator()); + MachineBasicBlock *LoopTestMBB = + MF.CreateMachineBasicBlock(MBB.getBasicBlock()); + MF.insert(MBBInsertPoint, LoopTestMBB); + MachineBasicBlock *LoopBodyMBB = + MF.CreateMachineBasicBlock(MBB.getBasicBlock()); + MF.insert(MBBInsertPoint, LoopBodyMBB); + MachineBasicBlock *ExitMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock()); + MF.insert(MBBInsertPoint, ExitMBB); + + // LoopTest: + // SUB SP, SP, #ProbeSize + emitFrameOffset(*LoopTestMBB, LoopTestMBB->end(), DL, AArch64::SP, + AArch64::SP, StackOffset::getFixed(NegProbeSize), TII, + MachineInstr::FrameSetup); + + // CMP SP, TargetReg + BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(AArch64::SUBSXrx64), + AArch64::XZR) + .addReg(AArch64::SP) + .addReg(TargetReg) + .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 0)) + .setMIFlags(MachineInstr::FrameSetup); + + // B.LE LoopExit + BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(AArch64::Bcc)) + .addImm(AArch64CC::LE) + .addMBB(ExitMBB) + .setMIFlags(MachineInstr::FrameSetup); + + // STR XZR, [SP] + BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII->get(AArch64::STRXui)) + .addReg(AArch64::XZR) + .addReg(AArch64::SP) + .addImm(0) + .setMIFlags(MachineInstr::FrameSetup); + + // B loop + BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII->get(AArch64::B)) + .addMBB(LoopTestMBB); + + // LoopExit: + // MOV SP, TargetReg + BuildMI(*ExitMBB, ExitMBB->end(), DL, TII->get(AArch64::ADDXri), AArch64::SP) + .addReg(TargetReg) + .addImm(0) + .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)) + .setMIFlags(MachineInstr::FrameSetup); + + // STR XZR, [SP] + BuildMI(*ExitMBB, ExitMBB->end(), DL, TII->get(AArch64::STRXui)) + .addReg(AArch64::XZR) + .addReg(AArch64::SP) + .addImm(0) + .setMIFlags(MachineInstr::FrameSetup); + + LoopTestMBB->addSuccessor(ExitMBB); + LoopTestMBB->addSuccessor(LoopBodyMBB); + LoopBodyMBB->addSuccessor(LoopTestMBB); + // Synthesize the exit MBB. + ExitMBB->splice(ExitMBB->end(), &MBB, std::next(MBBI), MBB.end()); + ExitMBB->transferSuccessorsAndUpdatePHIs(&MBB); + MBB.addSuccessor(LoopTestMBB); + + // Update liveins. 
+ if (MF.getRegInfo().reservedRegsFrozen()) { + recomputeLiveIns(*LoopTestMBB); + recomputeLiveIns(*LoopBodyMBB); + recomputeLiveIns(*ExitMBB); + } + + return ExitMBB->begin(); +} + #define GET_INSTRINFO_HELPERS #define GET_INSTRMAP_INFO #include "AArch64GenInstrInfo.inc" diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -653,6 +653,21 @@ def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2), [(AArch64callseq_end timm:$amt1, timm:$amt2)]>, Sched<[]>; + +// Probed stack allocation of a constant size, used in function prologues when +// stack-clash protection is enabled. +def PROBED_STACKALLOC : Pseudo<(outs GPR64:$scratch), + (ins i64imm:$stacksize), + []>, + Sched<[]>; +// Probed stack allocation of a variable size, used in function prologues when +// stack-clash protection is enabled. The register input is the target SP, +// which should be below the current value, and has no alignment requirements +// beyond the usual 16-byte alignment for SP. +def PROBED_STACKALLOC_VAR : Pseudo<(outs), + (ins GPR64:$target), + []>, + Sched<[]>; } // Defs = [SP], Uses = [SP], hasSideEffects = 1, isCodeGenOnly = 1 let isReMaterializable = 1, isCodeGenOnly = 1 in { diff --git a/llvm/test/CodeGen/AArch64/stack-probing-64k.ll b/llvm/test/CodeGen/AArch64/stack-probing-64k.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/stack-probing-64k.ll @@ -0,0 +1,289 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs | FileCheck %s +; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs -global-isel | FileCheck %s + +; Tests for prolog sequences for stack probing, when using a 64KiB stack guard. + +; Small stack frame, no probing required. +define void @static_64(i8** %out) "probe-stack"="inline-asm" "stack-probe-size"="65536" "frame-pointer"="none" { +; CHECK-LABEL: static_64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub sp, sp, #64 // =64 +; CHECK-NEXT: .cfi_def_cfa_offset 64 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #64 // =64 +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 64, align 1 + store i8* %vla, i8** %out, align 8 + ret void +} + +; At 256 bytes we start to always create a frame pointer. No frame smaller then +; this needs a probe, so we can use the saving of at least one CSR as a probe +; at the top of our frame. +define void @static_256(i8** %out) "probe-stack"="inline-asm" "stack-probe-size"="65536" "frame-pointer"="none" { +; CHECK-LABEL: static_256: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub sp, sp, #272 // =272 +; CHECK-NEXT: str x29, [sp, #256] // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 272 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #272 // =272 +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 256, align 1 + store i8* %vla, i8** %out, align 8 + ret void +} + +; At just less that 1024 bytes, this is the largest frame which doesn't need +; probing. +define void @static_1008(i8** %out) "probe-stack"="inline-asm" "stack-probe-size"="65536" "frame-pointer"="none" { +; CHECK-LABEL: static_1008: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #1008 // =1008 +; CHECK-NEXT: .cfi_def_cfa_offset 1024 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #1008 // =1008 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 1008, align 1 + store i8* %vla, i8** %out, align 8 + ret void +} + +; At 1024 bytes, we need to start probing to guarantee that SP does not go too +; far into the guard at an ABI boundary. +define void @static_1024(i8** %out) "probe-stack"="inline-asm" "stack-probe-size"="65536" "frame-pointer"="none" { +; CHECK-LABEL: static_1024: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #1024 // =1024 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: .cfi_def_cfa_offset 1040 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #1024 // =1024 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 1024, align 1 + store i8* %vla, i8** %out, align 8 + ret void +} + +; The stack offset does not fit into one SUB unstruction, so two are used. +define void @static_65520(i8** %out) "probe-stack"="inline-asm" "stack-probe-size"="65536" "frame-pointer"="none" { +; CHECK-LABEL: static_65520: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #15, lsl #12 // =61440 +; CHECK-NEXT: sub sp, sp, #4080 // =4080 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: .cfi_def_cfa_offset 65536 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #15, lsl #12 // =61440 +; CHECK-NEXT: add sp, sp, #4080 // =4080 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 65520, align 1 + store i8* %vla, i8** %out, align 8 + ret void +} + + +; 64k bytes is the largest frame we can probe in one go. +define void @static_65536(i8** %out) "probe-stack"="inline-asm" "stack-probe-size"="65536" "frame-pointer"="none" { +; CHECK-LABEL: static_65536: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: .cfi_def_cfa_offset 65552 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #16, lsl #12 // =65536 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 65536, align 1 + store i8* %vla, i8** %out, align 8 + ret void +} + +; Smallest stack frame (64k+16) which needs two probes (the first, smaller one gets +; folded into a store with writeback). +define void @static_65552(i8** %out) "probe-stack"="inline-asm" "stack-probe-size"="65536" "frame-pointer"="none" { +; CHECK-LABEL: static_65552: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: str xzr, [sp, #-16]! 
+; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: .cfi_def_cfa_offset 65568 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #16, lsl #12 // =65536 +; CHECK-NEXT: add sp, sp, #16 // =16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 65552, align 1 + store i8* %vla, i8** %out, align 8 + ret void +} + +; Largest frame needing two probes. +define void @static_131072(i8** %out) "probe-stack"="inline-asm" "stack-probe-size"="65536" "frame-pointer"="none" { +; CHECK-LABEL: static_131072: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: .cfi_def_cfa_offset 131088 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #32, lsl #12 // =131072 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 131072, align 1 + store i8* %vla, i8** %out, align 8 + ret void +} + +; Largest frame probed without a loop (3 probes). +define void @static_196592(i8** %out) "probe-stack"="inline-asm" "stack-probe-size"="65536" "frame-pointer"="none" { +; CHECK-LABEL: static_196592: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #15, lsl #12 // =61440 +; CHECK-NEXT: sub sp, sp, #4080 // =4080 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: .cfi_def_cfa_offset 196608 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #47, lsl #12 // =192512 +; CHECK-NEXT: add sp, sp, #4080 // =4080 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 196592, align 1 + store i8* %vla, i8** %out, align 8 + ret void +} + +; Smallest frame probed with a loop. +define void @static_196608(i8** %out) "probe-stack"="inline-asm" "stack-probe-size"="65536" "frame-pointer"="none" { +; CHECK-LABEL: static_196608: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: sub x9, sp, #48, lsl #12 // =196608 +; CHECK-NEXT: .LBB9_1: // %entry +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 +; CHECK-NEXT: cmp sp, x9 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: b.ne .LBB9_1 +; CHECK-NEXT: // %bb.2: // %entry +; CHECK-NEXT: .cfi_def_cfa_offset 196624 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #48, lsl #12 // =196608 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 196608, align 1 + store i8* %vla, i8** %out, align 8 + ret void +} + +; Large enough to use a loop, but not a miltiple of 64KiB so needs an extra +; probe for the remainder. +define void @static_197632(i8** %out) "probe-stack"="inline-asm" "stack-probe-size"="65536" "frame-pointer"="none" { +; CHECK-LABEL: static_197632: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #1024 // =1024 +; CHECK-NEXT: sub x9, sp, #48, lsl #12 // =196608 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: .LBB10_1: // %entry +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 +; CHECK-NEXT: cmp sp, x9 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: b.ne .LBB10_1 +; CHECK-NEXT: // %bb.2: // %entry +; CHECK-NEXT: .cfi_def_cfa_offset 197648 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #48, lsl #12 // =196608 +; CHECK-NEXT: add sp, sp, #1024 // =1024 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 197632, align 1 + store i8* %vla, i8** %out, align 8 + ret void +} + +; A small allocation, but with a very large alignment requirement. We do this +; by moving SP far enough that a sufficiently-aligned block will exist +; somewhere in the stack frame, so must probe the whole of that larger SP move. +define void @static_16_align_8192(i8** %out) "probe-stack"="inline-asm" "stack-probe-size"="65536" "frame-pointer"="none" { +; CHECK-LABEL: static_16_align_8192: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-NEXT: sub x9, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: sub x9, x9, #4080 // =4080 +; CHECK-NEXT: and x9, x9, #0xffffffffffffe000 +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: .LBB11_1: // %entry +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 +; CHECK-NEXT: cmp sp, x9 +; CHECK-NEXT: b.le .LBB11_3 +; CHECK-NEXT: // %bb.2: // %entry +; CHECK-NEXT: // in Loop: Header=BB11_1 Depth=1 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: b .LBB11_1 +; CHECK-NEXT: .LBB11_3: // %entry +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: .cfi_def_cfa w29, 16 +; CHECK-NEXT: .cfi_offset w30, -8 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 16, align 8192 + store i8* %vla, i8** %out, align 8 + ret void +} diff --git a/llvm/test/CodeGen/AArch64/stack-probing-sve.ll b/llvm/test/CodeGen/AArch64/stack-probing-sve.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/stack-probing-sve.ll @@ -0,0 +1,351 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs | FileCheck %s +; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs -global-isel -global-isel-abort=2 | FileCheck %s + +; Test prolog sequences for stack probing when SVE objects are involved. + +; An SVE stack slot needs probing, because we don't know it's size at +; compile-time. +define void @sve_1_vector(** %out) "probe-stack"="inline-asm" "frame-pointer"="none" "target-features"="+sve" { +; CHECK-LABEL: sve_1_vector: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl x9, sp, #-1 +; CHECK-NEXT: .LBB0_1: // %entry +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: cmp sp, x9 +; CHECK-NEXT: b.le .LBB0_3 +; CHECK-NEXT: // %bb.2: // %entry +; CHECK-NEXT: // in Loop: Header=BB0_1 Depth=1 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: b .LBB0_1 +; CHECK-NEXT: .LBB0_3: // %entry +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %vec = alloca , align 16 + ret void +} + +; As above, but with 4 SVE vectors of stack space. +define void @sve_4_vector(** %out) "probe-stack"="inline-asm" "frame-pointer"="none" "target-features"="+sve" { +; CHECK-LABEL: sve_4_vector: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl x9, sp, #-4 +; CHECK-NEXT: .LBB1_1: // %entry +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: cmp sp, x9 +; CHECK-NEXT: b.le .LBB1_3 +; CHECK-NEXT: // %bb.2: // %entry +; CHECK-NEXT: // in Loop: Header=BB1_1 Depth=1 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: b .LBB1_1 +; CHECK-NEXT: .LBB1_3: // %entry +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: addvl sp, sp, #4 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %vec1 = alloca , align 16 + %vec2 = alloca , align 16 + %vec3 = alloca , align 16 + %vec4 = alloca , align 16 + ret void +} + +; The area allocated to save callee-saved SVE registers does not need to be +; probed, because it will always be written to, which acts as a probe. +define void @sve_1v_csr( %a) "probe-stack"="inline-asm" "frame-pointer"="none" "target-features"="+sve" { +; CHECK-LABEL: sve_1v_csr: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str z8, [sp] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: //APP +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: ldr z8, [sp] // 16-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + call void asm sideeffect "", "~{z8}" () + ret void +} + +define void @sve_4v_csr( %a) "probe-stack"="inline-asm" "frame-pointer"="none" "target-features"="+sve" { +; CHECK-LABEL: sve_4v_csr: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-4 +; CHECK-NEXT: str z11, [sp] // 16-byte Folded Spill +; CHECK-NEXT: str z10, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z9, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z8, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 32 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: //APP +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: ldr z11, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ldr z10, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z9, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z8, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #4 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + call void asm sideeffect "", "~{z8},~{z9},~{z10},~{z11}" () + ret void +} + +define void @sve_1p_csr( %a) "probe-stack"="inline-asm" "frame-pointer"="none" "target-features"="+sve" { +; CHECK-LABEL: sve_1p_csr: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: //APP +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + call void asm sideeffect "", "~{p8}" () + ret void +} + +define void @sve_4p_csr( %a) "probe-stack"="inline-asm" "frame-pointer"="none" "target-features"="+sve" { +; CHECK-LABEL: sve_4p_csr: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p11, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p10, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p9, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: //APP +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: ldr p11, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p10, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p9, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + call void asm sideeffect "", "~{p8},~{p9},~{p10},~{p11}" () + ret void +} + +; 1 SVE vector, which needs probing, and a 16-byte fixed size object, which +; doesn't. 
Here the final store of the SVE probing loop gets merged with the +; fixed-size SP decrement, but this doesn't affect probing as the pattern of +; memory access is the same. +define void @sve_1_vector_16_arr(** %out) "probe-stack"="inline-asm" "frame-pointer"="none" "target-features"="+sve" { +; CHECK-LABEL: sve_1_vector_16_arr: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl x9, sp, #-1 +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: .LBB6_1: // %entry +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: cmp sp, x9 +; CHECK-NEXT: b.le .LBB6_3 +; CHECK-NEXT: // %bb.2: // %entry +; CHECK-NEXT: // in Loop: Header=BB6_1 Depth=1 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: b .LBB6_1 +; CHECK-NEXT: .LBB6_3: // %entry +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: str xzr, [sp], #-16 +; CHECK-NEXT: .cfi_def_cfa w29, 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #16 // =16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %vec = alloca , align 16 + %arr = alloca i8, i64 16, align 1 + ret void +} + +; 1 SVE stack slot and a 4096-byte stack slot, both of which need probing. +; TODO: This could be optimised by combining the fixed-size offset into the +; loop. +define void @sve_1_vector_4096_arr(** %out) "probe-stack"="inline-asm" "frame-pointer"="none" "target-features"="+sve" { +; CHECK-LABEL: sve_1_vector_4096_arr: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl x9, sp, #-1 +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: .LBB7_1: // %entry +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: cmp sp, x9 +; CHECK-NEXT: b.le .LBB7_3 +; CHECK-NEXT: // %bb.2: // %entry +; CHECK-NEXT: // in Loop: Header=BB7_1 Depth=1 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: b .LBB7_1 +; CHECK-NEXT: .LBB7_3: // %entry +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: .cfi_def_cfa w29, 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %vec = alloca , align 16 + %arr = alloca i8, i64 4096, align 1 + ret void +} + +; 1 SVE stack slot and a large stack slot, both of which need probing. +; TODO this could be optimised by combining both loops. +define void @sve_1_vector_12288_arr(** %out) "probe-stack"="inline-asm" "frame-pointer"="none" "target-features"="+sve" { +; CHECK-LABEL: sve_1_vector_12288_arr: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl x9, sp, #-1 +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: .LBB8_1: // %entry +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: cmp sp, x9 +; CHECK-NEXT: b.le .LBB8_3 +; CHECK-NEXT: // %bb.2: // %entry +; CHECK-NEXT: // in Loop: Header=BB8_1 Depth=1 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: b .LBB8_1 +; CHECK-NEXT: .LBB8_3: // %entry +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: sub x9, sp, #3, lsl #12 // =12288 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: .LBB8_4: // %entry +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: cmp sp, x9 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: b.ne .LBB8_4 +; CHECK-NEXT: // %bb.5: // %entry +; CHECK-NEXT: .cfi_def_cfa w29, 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #3, lsl #12 // =12288 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %vec = alloca , align 16 + %arr = alloca i8, i64 12288, align 1 + ret void +} + +; Not tested: SVE stack objects with alignment >16 bytes, which isn't currently +; supported even without stack-probing. + +; 1 SVE vector, which needs probing, and a 16-byte fixed size object, which +; has a large alignment requirement so also needs a probing loop. +define void @sve_1_vector_16_arr_align_8192(** %out) "probe-stack"="inline-asm" "frame-pointer"="none" "target-features"="+sve" { +; CHECK-LABEL: sve_1_vector_16_arr_align_8192: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-NEXT: addvl x9, sp, #-1 +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: .LBB9_1: // %entry +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: cmp sp, x9 +; CHECK-NEXT: b.le .LBB9_3 +; CHECK-NEXT: // %bb.2: // %entry +; CHECK-NEXT: // in Loop: Header=BB9_1 Depth=1 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: b .LBB9_1 +; CHECK-NEXT: .LBB9_3: // %entry +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: sub x9, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: sub x9, x9, #4080 // =4080 +; CHECK-NEXT: and x9, x9, #0xffffffffffffe000 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: .LBB9_4: // %entry +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: cmp sp, x9 +; CHECK-NEXT: b.le .LBB9_6 +; CHECK-NEXT: // %bb.5: // %entry +; CHECK-NEXT: // in Loop: Header=BB9_4 Depth=1 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: b .LBB9_4 +; CHECK-NEXT: .LBB9_6: // %entry +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: .cfi_def_cfa w29, 16 +; CHECK-NEXT: .cfi_offset w30, -8 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: ret +entry: + %vec = alloca , align 16 + %arr = alloca i8, i64 16, align 8192 + ret void +} + +; For 64k guard pages, the only difference is the constant subtracted from SP +; in the loop. +define void @sve_64k_guard(** %out) "probe-stack"="inline-asm" "frame-pointer"="none" "target-features"="+sve" "stack-probe-size"="65536" { +; CHECK-LABEL: sve_64k_guard: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl x9, sp, #-1 +; CHECK-NEXT: .LBB10_1: // %entry +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 +; CHECK-NEXT: cmp sp, x9 +; CHECK-NEXT: b.le .LBB10_3 +; CHECK-NEXT: // %bb.2: // %entry +; CHECK-NEXT: // in Loop: Header=BB10_1 Depth=1 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: b .LBB10_1 +; CHECK-NEXT: .LBB10_3: // %entry +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %vec = alloca , align 16 + ret void +} + +; Not tested: dynamic allocations of SVE vectors, which don't currently work +; without stack probing. diff --git a/llvm/test/CodeGen/AArch64/stack-probing.ll b/llvm/test/CodeGen/AArch64/stack-probing.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/stack-probing.ll @@ -0,0 +1,291 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs | FileCheck %s +; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs -global-isel | FileCheck %s + +; Tests for prolog sequences for stack probing, when using a 4KiB stack guard. + +; Small stack frame, no probing required. +define void @static_64(i8** %out) "probe-stack"="inline-asm" "frame-pointer"="none" { +; CHECK-LABEL: static_64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub sp, sp, #64 // =64 +; CHECK-NEXT: .cfi_def_cfa_offset 64 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #64 // =64 +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 64, align 1 + store i8* %vla, i8** %out, align 8 + ret void +} + +; At 256 bytes we start to always create a frame pointer. No frame smaller then +; this needs a probe, so we can use the saving of at least one CSR as a probe +; at the top of our frame. +define void @static_256(i8** %out) "probe-stack"="inline-asm" "frame-pointer"="none" { +; CHECK-LABEL: static_256: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub sp, sp, #272 // =272 +; CHECK-NEXT: str x29, [sp, #256] // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 272 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #272 // =272 +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 256, align 1 + store i8* %vla, i8** %out, align 8 + ret void +} + +; At just less that 1024 bytes, this is the largest frame which doesn't need +; probing. +define void @static_1008(i8** %out) "probe-stack"="inline-asm" "frame-pointer"="none" { +; CHECK-LABEL: static_1008: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #1008 // =1008 +; CHECK-NEXT: .cfi_def_cfa_offset 1024 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #1008 // =1008 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 1008, align 1 + store i8* %vla, i8** %out, align 8 + ret void +} + +; At 1024 bytes, we need to start probing to guarantee that SP does not go too +; far into the guard at an ABI boundary. 
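+; (1024 is the limit returned by getStackProbeMaxUnprobedStack.)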
+define void @static_1024(i8** %out) "probe-stack"="inline-asm" "frame-pointer"="none" { +; CHECK-LABEL: static_1024: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #1024 // =1024 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: .cfi_def_cfa_offset 1040 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #1024 // =1024 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 1024, align 1 + store i8* %vla, i8** %out, align 8 + ret void +} + +; 4096 bytes is the largest frame we can probe in one go. +define void @static_4096(i8** %out) "probe-stack"="inline-asm" "frame-pointer"="none" { +; CHECK-LABEL: static_4096: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: .cfi_def_cfa_offset 4112 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 4096, align 1 + store i8* %vla, i8** %out, align 8 + ret void +} + +; Smallest stack frame which needs two probes (the first, smaller one gets +; folded into a store with writeback). +define void @static_4112(i8** %out) "probe-stack"="inline-asm" "frame-pointer"="none" { +; CHECK-LABEL: static_4112: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: str xzr, [sp, #-16]! +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: .cfi_def_cfa_offset 4128 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: add sp, sp, #16 // =16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 4112, align 1 + store i8* %vla, i8** %out, align 8 + ret void +} + +; Needs two probes, but neither can be folded into a store with writeback. +define void @static_6144(i8** %out) "probe-stack"="inline-asm" "frame-pointer"="none" { +; CHECK-LABEL: static_6144: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #2048 // =2048 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: .cfi_def_cfa_offset 6160 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: add sp, sp, #2048 // =2048 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 6144, align 1 + store i8* %vla, i8** %out, align 8 + ret void +} + +; Largest frame needing two probes +define void @static_8192(i8** %out) "probe-stack"="inline-asm" "frame-pointer"="none" { +; CHECK-LABEL: static_8192: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: .cfi_def_cfa_offset 8208 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #2, lsl #12 // =8192 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 8192, align 1 + store i8* %vla, i8** %out, align 8 + ret void +} + +; Largest frame probed without a loop +define void @static_12272(i8** %out) "probe-stack"="inline-asm" "frame-pointer"="none" { +; CHECK-LABEL: static_12272: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #4080 // =4080 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: .cfi_def_cfa_offset 12288 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #2, lsl #12 // =8192 +; CHECK-NEXT: add sp, sp, #4080 // =4080 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 12272, align 1 + store i8* %vla, i8** %out, align 8 + ret void +} + +; Smallest frame probed with a loop +define void @static_12288(i8** %out) "probe-stack"="inline-asm" "frame-pointer"="none" { +; CHECK-LABEL: static_12288: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: sub x9, sp, #3, lsl #12 // =12288 +; CHECK-NEXT: .LBB9_1: // %entry +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: cmp sp, x9 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: b.ne .LBB9_1 +; CHECK-NEXT: // %bb.2: // %entry +; CHECK-NEXT: .cfi_def_cfa_offset 12304 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #3, lsl #12 // =12288 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 12288, align 1 + store i8* %vla, i8** %out, align 8 + ret void +} + +; Large enough to use a loop, but not a multiple of 4KiB so needs an extra +; probe for the remainder. +define void @static_13312(i8** %out) "probe-stack"="inline-asm" "frame-pointer"="none" { +; CHECK-LABEL: static_13312: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #1024 // =1024 +; CHECK-NEXT: sub x9, sp, #3, lsl #12 // =12288 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: .LBB10_1: // %entry +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: cmp sp, x9 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: b.ne .LBB10_1 +; CHECK-NEXT: // %bb.2: // %entry +; CHECK-NEXT: .cfi_def_cfa_offset 13328 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #3, lsl #12 // =12288 +; CHECK-NEXT: add sp, sp, #1024 // =1024 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 13312, align 1 + store i8* %vla, i8** %out, align 8 + ret void +} + +; A small allocation, but with a very large alignment requirement. 
We do this +; by moving SP far enough that a sufficiently-aligned block will exist +; somewhere in the stack frame, so must probe the whole of that larger SP move. +define void @static_16_align_8192(i8** %out) "probe-stack"="inline-asm" "frame-pointer"="none" { +; CHECK-LABEL: static_16_align_8192: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-NEXT: sub x9, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: sub x9, x9, #4080 // =4080 +; CHECK-NEXT: and x9, x9, #0xffffffffffffe000 +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: .LBB11_1: // %entry +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: cmp sp, x9 +; CHECK-NEXT: b.le .LBB11_3 +; CHECK-NEXT: // %bb.2: // %entry +; CHECK-NEXT: // in Loop: Header=BB11_1 Depth=1 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: b .LBB11_1 +; CHECK-NEXT: .LBB11_3: // %entry +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: .cfi_def_cfa w29, 16 +; CHECK-NEXT: .cfi_offset w30, -8 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 16, align 8192 + store i8* %vla, i8** %out, align 8 + ret void +} + + +