Index: llvm/lib/Target/AArch64/AArch64FrameLowering.h =================================================================== --- llvm/lib/Target/AArch64/AArch64FrameLowering.h +++ llvm/lib/Target/AArch64/AArch64FrameLowering.h @@ -157,10 +157,28 @@ MachineBasicBlock::iterator MBBI) const; void emitCalleeSavedSVERestores(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const; + void allocateSVEStackSpace(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + StackOffset AllocSize, StackOffset InitialOffset, + bool EmitCFI) const; /// Emit target zero call-used regs. void emitZeroCallUsedRegs(BitVector RegsToZero, MachineBasicBlock &MBB) const override; + + /// Replace a StackProbe stub (if any) with the actual probe code inline. + void inlineStackProbe(MachineFunction &MF, + MachineBasicBlock &PrologueMBB) const override; + MachineBasicBlock::iterator + inlineStackProbeFixed(MachineBasicBlock::iterator MBBI) const; + + MachineBasicBlock::iterator + inlineStackProbeVar(MachineBasicBlock::iterator MBBI) const; + + MachineBasicBlock::iterator + inlineStackProbeLoopExactMultiple(MachineBasicBlock::iterator MBBI, + int64_t ProbeSize, + Register TargetReg) const; }; } // End llvm namespace Index: llvm/lib/Target/AArch64/AArch64FrameLowering.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -301,6 +301,7 @@ static bool needsWinCFI(const MachineFunction &MF); static StackOffset getSVEStackSize(const MachineFunction &MF); static bool needsShadowCallStackPrologueEpilogue(MachineFunction &MF); +static unsigned findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB); /// Returns true if a homogeneous prolog or epilog code can be emitted /// for the size optimization. If possible, a frame helper call is injected. @@ -671,6 +672,74 @@ emitCalleeSavedRestores(MBB, MBBI, true); } +void AArch64FrameLowering::allocateSVEStackSpace( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + StackOffset AllocSize, StackOffset InitialOffset, bool EmitCFI) const { + DebugLoc DL; + MachineFunction &MF = *MBB.getParent(); + const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>(); + const AArch64RegisterInfo &RegInfo = *Subtarget.getRegisterInfo(); + const AArch64TargetLowering &TLI = *Subtarget.getTargetLowering(); + const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); + + // If not probing the stack, or if the (unknown) allocation size is less than + // the probe size, decrement the stack pointer right away. This avoids having + // to emit a probing loop when allocating space for up to 16 SVE registers + // when using 4k probes. + + // The bit-length of an SVE register is architecturally capped at 16 times + // the 128-bit minimum, so the scalable part of AllocSize can grow by at most + // a factor of 16. + const int64_t MAX_BYTES_PER_SCALABLE_BYTE = 16; + int64_t ProbeSize = TLI.getStackProbeSize(MF); + if (!TLI.hasInlineStackProbe(MF) || + AllocSize.getScalable() * MAX_BYTES_PER_SCALABLE_BYTE + + AllocSize.getFixed() <= + ProbeSize) { + emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, -AllocSize, &TII, + MachineInstr::FrameSetup, false, false, nullptr, EmitCFI, + InitialOffset); + if (TLI.hasInlineStackProbe(MF)) { + // Issue a probe at the top of the stack to prepare for subsequent + // allocations.
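 + // A single probe suffices here: the check above guarantees that the whole + // decrement is at most one probe interval, so this allocation cannot skip + // a guard page.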
+ // STR XZR, [SP] + BuildMI(MBB, MBBI, DL, TII.get(AArch64::STRXui)) + .addReg(AArch64::XZR) + .addReg(AArch64::SP) + .addImm(0) + .setMIFlags(MachineInstr::FrameSetup); + } + return; + } + + // If we can't be sure the allocation size is less than the probe size, we + // have to emit a stack probing loop. + Register ScratchReg = findScratchNonCalleeSaveRegister(&MBB); + assert(ScratchReg != AArch64::NoRegister); + // Get the new top of the stack into a scratch register. + emitFrameOffset(MBB, MBBI, DL, ScratchReg, AArch64::SP, -AllocSize, &TII, + MachineInstr::FrameSetup, false, false, nullptr, EmitCFI, + InitialOffset); + // Arrange to emit a probing loop by decrementing SP until it reaches that + // new top of the stack. + BuildMI(MBB, MBBI, DL, TII.get(AArch64::PROBED_STACKALLOC_VAR), AArch64::SP) + .addReg(ScratchReg); + // Set SP to its new value. + // MOV SP, Xs + BuildMI(MBB, MBBI, DL, TII.get(AArch64::ADDXri), AArch64::SP) + .addReg(ScratchReg) + .addImm(0) + .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)) + .setMIFlags(MachineInstr::FrameSetup); + if (EmitCFI) { + // Set the CFA register back to SP. + unsigned Reg = RegInfo.getDwarfRegNum(AArch64::SP, true); + unsigned CFIIndex = + MF.addFrameInst(MCCFIInstruction::createDefCfaRegister(nullptr, Reg)); + BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex) + .setMIFlags(MachineInstr::FrameSetup); + } +} + static MCRegister getRegisterOrZero(MCRegister Reg, bool HasSVE) { switch (Reg.id()) { default: @@ -854,9 +923,11 @@ MachineBasicBlock *TmpMBB = const_cast<MachineBasicBlock *>(&MBB); const AArch64Subtarget &Subtarget = MF->getSubtarget<AArch64Subtarget>(); const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); + const AArch64TargetLowering *TLI = Subtarget.getTargetLowering(); - // Don't need a scratch register if we're not going to re-align the stack. - if (!RegInfo->hasStackRealignment(*MF)) + // Don't need a scratch register if we're not going to re-align the stack or + // emit stack probes. + if (!RegInfo->hasStackRealignment(*MF) && !TLI->hasInlineStackProbe(*MF)) return true; // Otherwise, we can use any block as long as it has a scratch register // available. @@ -1428,6 +1499,7 @@ const Function &F = MF.getFunction(); const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>(); const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); + const AArch64TargetLowering &TLI = *Subtarget.getTargetLowering(); const TargetInstrInfo *TII = Subtarget.getInstrInfo(); MachineModuleInfo &MMI = MF.getMMI(); AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); @@ -1783,12 +1855,14 @@ } } - StackOffset AllocateBefore = SVEStackSize, AllocateAfter = {}; + StackOffset SVECalleeSavedSize = {}, SVELocalsSize = SVEStackSize; MachineBasicBlock::iterator CalleeSavesBegin = MBBI, CalleeSavesEnd = MBBI; // Process the SVE callee-saves to determine what space needs to be // allocated. if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) { + LLVM_DEBUG(dbgs() << "SVECalleeSavedStackSize = " << CalleeSavedSize + << "\n"); // Find callee save instructions in frame. CalleeSavesBegin = MBBI; assert(IsSVECalleeSave(CalleeSavesBegin) && "Unexpected instruction"); @@ -1796,33 +1870,40 @@ ++MBBI; CalleeSavesEnd = MBBI; - AllocateBefore = StackOffset::getScalable(CalleeSavedSize); - AllocateAfter = SVEStackSize - AllocateBefore; + SVECalleeSavedSize = StackOffset::getScalable(CalleeSavedSize); + SVELocalsSize = SVEStackSize - SVECalleeSavedSize; + + // Allocate space for the SVE callee saves.
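 + // The callee-save area is allocated separately from the SVE locals so + // that, with asynchronous CFI, the CFI offsets of the saved SVE registers + // can be emitted as soon as the saves are in place (see + // emitCalleeSavedSVELocations below).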
+ if (SVECalleeSavedSize) { + allocateSVEStackSpace( + MBB, CalleeSavesBegin, SVECalleeSavedSize, + StackOffset::getFixed((int64_t)MFI.getStackSize() - NumBytes), + EmitAsyncCFI && !HasFP); + if (EmitAsyncCFI) + emitCalleeSavedSVELocations(MBB, CalleeSavesEnd); + } } - // Allocate space for the callee saves (if any). - emitFrameOffset( - MBB, CalleeSavesBegin, DL, AArch64::SP, AArch64::SP, -AllocateBefore, TII, - MachineInstr::FrameSetup, false, false, nullptr, - EmitAsyncCFI && !HasFP && AllocateBefore, - StackOffset::getFixed((int64_t)MFI.getStackSize() - NumBytes)); - - if (EmitAsyncCFI) - emitCalleeSavedSVELocations(MBB, CalleeSavesEnd); - - // Finally allocate remaining SVE stack space. - emitFrameOffset(MBB, CalleeSavesEnd, DL, AArch64::SP, AArch64::SP, - -AllocateAfter, TII, MachineInstr::FrameSetup, false, false, - nullptr, EmitAsyncCFI && !HasFP && AllocateAfter, - AllocateBefore + StackOffset::getFixed( - (int64_t)MFI.getStackSize() - NumBytes)); + // Allocate stack space for the local SVE objects. + if (SVELocalsSize) + allocateSVEStackSpace( + MBB, CalleeSavesEnd, SVELocalsSize, + SVECalleeSavedSize + + StackOffset::getFixed((int64_t)MFI.getStackSize() - NumBytes), + EmitAsyncCFI && !HasFP); // Allocate space for the rest of the frame. if (NumBytes) { unsigned scratchSPReg = AArch64::SP; + bool NeedsStackProbe = TLI.hasInlineStackProbe(MF) && + (NumBytes > AArch64::StackProbeMaxUnprobedStack || + MFI.hasVarSizedObjects()); if (NeedsRealignment) { scratchSPReg = findScratchNonCalleeSaveRegister(&MBB); + NeedsStackProbe |= TLI.hasInlineStackProbe(MF) && + (NumBytes + MFI.getMaxAlign().value()) > + AArch64::StackProbeMaxUnprobedStack; assert(scratchSPReg != AArch64::NoRegister); } @@ -1831,12 +1912,25 @@ // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have // the correct value here, as NumBytes also includes padding bytes, // which shouldn't be counted here. - emitFrameOffset( - MBB, MBBI, DL, scratchSPReg, AArch64::SP, - StackOffset::getFixed(-NumBytes), TII, MachineInstr::FrameSetup, - false, NeedsWinCFI, &HasWinCFI, EmitAsyncCFI && !HasFP, + StackOffset CFAOffset = SVEStackSize + - StackOffset::getFixed((int64_t)MFI.getStackSize() - NumBytes)); + StackOffset::getFixed((int64_t)MFI.getStackSize() - NumBytes); + if (NeedsStackProbe && !NeedsRealignment) { + // If we don't need to re-align the stack, we can use a more efficient + // sequence for stack probing. + Register ScratchReg = findScratchNonCalleeSaveRegister(&MBB); + assert(ScratchReg != AArch64::NoRegister); + BuildMI(MBB, MBBI, DL, TII->get(AArch64::PROBED_STACKALLOC)) + .addDef(ScratchReg) + .addImm(NumBytes) + .addImm(CFAOffset.getFixed()) + .addImm(CFAOffset.getScalable()); + } else { + emitFrameOffset(MBB, MBBI, DL, scratchSPReg, AArch64::SP, + StackOffset::getFixed(-NumBytes), TII, + MachineInstr::FrameSetup, false, NeedsWinCFI, + &HasWinCFI, EmitAsyncCFI && !HasFP, CFAOffset); + } } if (NeedsRealignment) { assert(MFI.getMaxAlign() > Align(1)); @@ -1845,12 +1939,48 @@ // SUB X9, SP, NumBytes // -- X9 is temporary register, so shouldn't contain any live data here, // -- free to use. This is already produced by emitFrameOffset above. 
- // AND SP, X9, 0b11111...0000 - uint64_t AndMask = ~(MFI.getMaxAlign().value() - 1); - BuildMI(MBB, MBBI, DL, TII->get(AArch64::ANDXri), AArch64::SP) - .addReg(scratchSPReg, RegState::Kill) - .addImm(AArch64_AM::encodeLogicalImmediate(AndMask, 64)); + const uint64_t MaxAlign = MFI.getMaxAlign().value(); + const uint64_t AndMask = ~(MaxAlign - 1); + + if (NeedsStackProbe) { + // If allocation size is known to not exceed the probe size, don't emit + // a probing loop. + if (NumBytes + MaxAlign - 1 <= TLI.getStackProbeSize(MF)) { + // AND SP, X9, 0b11111...0000 + BuildMI(MBB, MBBI, DL, TII->get(AArch64::ANDXri), AArch64::SP) + .addReg(scratchSPReg, RegState::Kill) + .addImm(AArch64_AM::encodeLogicalImmediate(AndMask, 64)) + .setMIFlags(MachineInstr::FrameSetup); + // STR XZR, [SP] + BuildMI(MBB, MBBI, DL, TII->get(AArch64::STRXui)) + .addReg(AArch64::XZR) + .addReg(AArch64::SP) + .addImm(0) + .setMIFlags(MachineInstr::FrameSetup); + } else { + // AND X9, X9, 0b11111...0000 + BuildMI(MBB, MBBI, DL, TII->get(AArch64::ANDXri), scratchSPReg) + .addReg(scratchSPReg, RegState::Kill) + .addImm(AArch64_AM::encodeLogicalImmediate(AndMask, 64)) + .setMIFlags(MachineInstr::FrameSetup); + BuildMI(MBB, MBBI, DL, TII->get(AArch64::PROBED_STACKALLOC_VAR), + AArch64::SP) + .addReg(scratchSPReg); + // MOV SP, X9 + BuildMI(MBB, MBBI, DL, TII->get(AArch64::ADDXri), AArch64::SP) + .addReg(scratchSPReg) + .addImm(0) + .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)) + .setMIFlags(MachineInstr::FrameSetup); + } + } else { + // AND SP, X9, 0b11111...0000 + BuildMI(MBB, MBBI, DL, TII->get(AArch64::ANDXri), AArch64::SP) + .addReg(scratchSPReg, RegState::Kill) + .addImm(AArch64_AM::encodeLogicalImmediate(AndMask, 64)) + .setMIFlags(MachineInstr::FrameSetup); + } AFI->setStackRealigned(true); // No need for SEH instructions here; if we're realigning the stack, @@ -2809,7 +2939,8 @@ } return true; } - for (const RegPairInfo &RPI : llvm::reverse(RegPairs)) { + + auto EmitMI = [&](const RegPairInfo &RPI) { unsigned Reg1 = RPI.Reg1; unsigned Reg2 = RPI.Reg2; unsigned StrOpc; @@ -2828,30 +2959,30 @@ Align Alignment; switch (RPI.Type) { case RegPairInfo::GPR: - StrOpc = RPI.isPaired() ? AArch64::STPXi : AArch64::STRXui; - Size = 8; - Alignment = Align(8); - break; + StrOpc = RPI.isPaired() ? AArch64::STPXi : AArch64::STRXui; + Size = 8; + Alignment = Align(8); + break; case RegPairInfo::FPR64: - StrOpc = RPI.isPaired() ? AArch64::STPDi : AArch64::STRDui; - Size = 8; - Alignment = Align(8); - break; + StrOpc = RPI.isPaired() ? AArch64::STPDi : AArch64::STRDui; + Size = 8; + Alignment = Align(8); + break; case RegPairInfo::FPR128: - StrOpc = RPI.isPaired() ? AArch64::STPQi : AArch64::STRQui; - Size = 16; - Alignment = Align(16); - break; + StrOpc = RPI.isPaired() ? 
AArch64::STPQi : AArch64::STRQui; + Size = 16; + Alignment = Align(16); + break; case RegPairInfo::ZPR: - StrOpc = AArch64::STR_ZXI; - Size = 16; - Alignment = Align(16); - break; + StrOpc = AArch64::STR_ZXI; + Size = 16; + Alignment = Align(16); + break; case RegPairInfo::PPR: - StrOpc = AArch64::STR_PXI; - Size = 2; - Alignment = Align(2); - break; + StrOpc = AArch64::STR_PXI; + Size = 2; + Alignment = Align(2); + break; } LLVM_DEBUG(dbgs() << "CSR spill: (" << printReg(Reg1, TRI); if (RPI.isPaired()) dbgs() << ", " << printReg(Reg2, TRI); @@ -2896,8 +3027,11 @@ MachineFrameInfo &MFI = MF.getFrameInfo(); if (RPI.Type == RegPairInfo::ZPR || RPI.Type == RegPairInfo::PPR) MFI.setStackID(RPI.FrameIdx, TargetStackID::ScalableVector); + }; + + for (const RegPairInfo &RPI : llvm::reverse(RegPairs)) + EmitMI(RPI); - } return true; } @@ -4040,3 +4174,181 @@ dbgs() << "\n"; }); } + +/// Emit a loop to decrement SP until it is equal to TargetReg, with probes at +/// least every ProbeSize bytes. Returns an iterator of the first instruction +/// after the loop. The difference between SP and TargetReg must be an exact +/// multiple of ProbeSize. +MachineBasicBlock::iterator +AArch64FrameLowering::inlineStackProbeLoopExactMultiple( + MachineBasicBlock::iterator MBBI, int64_t ProbeSize, + Register TargetReg) const { + MachineBasicBlock &MBB = *MBBI->getParent(); + MachineFunction &MF = *MBB.getParent(); + const AArch64InstrInfo *TII = + MF.getSubtarget().getInstrInfo(); + DebugLoc DL = MBB.findDebugLoc(MBBI); + + MachineFunction::iterator MBBInsertPoint = std::next(MBB.getIterator()); + MachineBasicBlock *LoopMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock()); + MF.insert(MBBInsertPoint, LoopMBB); + MachineBasicBlock *ExitMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock()); + MF.insert(MBBInsertPoint, ExitMBB); + + // SUB SP, SP, #ProbeSize (or equivalent if ProbeSize is not encodable + // in SUB). + emitFrameOffset(*LoopMBB, LoopMBB->end(), DL, AArch64::SP, AArch64::SP, + StackOffset::getFixed(-ProbeSize), TII, + MachineInstr::FrameSetup); + // STR XZR, [SP] + BuildMI(*LoopMBB, LoopMBB->end(), DL, TII->get(AArch64::STRXui)) + .addReg(AArch64::XZR) + .addReg(AArch64::SP) + .addImm(0) + .setMIFlags(MachineInstr::FrameSetup); + // CMP SP, TargetReg + BuildMI(*LoopMBB, LoopMBB->end(), DL, TII->get(AArch64::SUBSXrx64), + AArch64::XZR) + .addReg(AArch64::SP) + .addReg(TargetReg) + .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 0)) + .setMIFlags(MachineInstr::FrameSetup); + // B.CC Loop + BuildMI(*LoopMBB, LoopMBB->end(), DL, TII->get(AArch64::Bcc)) + .addImm(AArch64CC::NE) + .addMBB(LoopMBB) + .setMIFlags(MachineInstr::FrameSetup); + + LoopMBB->addSuccessor(ExitMBB); + LoopMBB->addSuccessor(LoopMBB); + // Synthesize the exit MBB. + ExitMBB->splice(ExitMBB->end(), &MBB, MBBI, MBB.end()); + ExitMBB->transferSuccessorsAndUpdatePHIs(&MBB); + MBB.addSuccessor(LoopMBB); + // Update liveins. 
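 + // The loop and exit blocks were just created (and the tail of the original + // block spliced out), so their live-in sets have to be recomputed here.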
+ recomputeLiveIns(*LoopMBB); + recomputeLiveIns(*ExitMBB); + + return ExitMBB->begin(); +} + +MachineBasicBlock::iterator AArch64FrameLowering::inlineStackProbeFixed( + MachineBasicBlock::iterator MBBI) const { + MachineBasicBlock *MBB = MBBI->getParent(); + MachineFunction &MF = *MBB->getParent(); + const AArch64TargetLowering *TLI = + MF.getSubtarget<AArch64Subtarget>().getTargetLowering(); + const AArch64InstrInfo *TII = + MF.getSubtarget<AArch64Subtarget>().getInstrInfo(); + AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); + bool EmitAsyncCFI = AFI->needsAsyncDwarfUnwindInfo(MF); + bool HasFP = hasFP(MF); + + DebugLoc DL = MBB->findDebugLoc(MBBI); + Register ScratchReg = MBBI->getOperand(0).getReg(); + int64_t FrameSize = MBBI->getOperand(1).getImm(); + StackOffset CFAOffset = StackOffset::get(MBBI->getOperand(2).getImm(), + MBBI->getOperand(3).getImm()); + int64_t ProbeSize = TLI->getStackProbeSize(MF); + int64_t NumBlocks = FrameSize / ProbeSize; + int64_t ResidualSize = FrameSize % ProbeSize; + + LLVM_DEBUG(dbgs() << "Stack probing: total " << FrameSize << " bytes, " + << NumBlocks << " blocks of " << ProbeSize + << " bytes, plus " << ResidualSize << " bytes\n"); + + // Decrement SP by NumBlocks * ProbeSize bytes, with either an unrolled + // sequence or an ordinary loop. + if (NumBlocks <= AArch64::StackProbeMaxLoopUnroll) { + for (int i = 0; i < NumBlocks; ++i) { + // SUB SP, SP, #ProbeSize (or equivalent if ProbeSize is not + // encodable in a SUB). + emitFrameOffset(*MBB, MBBI, DL, AArch64::SP, AArch64::SP, + StackOffset::getFixed(-ProbeSize), TII, + MachineInstr::FrameSetup, false, false, nullptr, + EmitAsyncCFI && !HasFP, CFAOffset); + CFAOffset += StackOffset::getFixed(ProbeSize); + // STR XZR, [SP] + BuildMI(*MBB, MBBI, DL, TII->get(AArch64::STRXui)) + .addReg(AArch64::XZR) + .addReg(AArch64::SP) + .addImm(0) + .setMIFlags(MachineInstr::FrameSetup); + } + } else if (NumBlocks != 0) { + // SUB ScratchReg, SP, #NumBlocks * ProbeSize (or equivalent if the + // immediate is not encodable in a SUB). ScratchReg may temporarily become + // the CFA register. + emitFrameOffset(*MBB, MBBI, DL, ScratchReg, AArch64::SP, + StackOffset::getFixed(-ProbeSize * NumBlocks), TII, + MachineInstr::FrameSetup, false, false, nullptr, + EmitAsyncCFI && !HasFP, CFAOffset); + CFAOffset += StackOffset::getFixed(ProbeSize * NumBlocks); + MBBI = inlineStackProbeLoopExactMultiple(MBBI, ProbeSize, ScratchReg); + MBB = MBBI->getParent(); + if (EmitAsyncCFI && !HasFP) { + // Set the CFA register back to SP. + const AArch64RegisterInfo &RegInfo = + *MF.getSubtarget<AArch64Subtarget>().getRegisterInfo(); + unsigned Reg = RegInfo.getDwarfRegNum(AArch64::SP, true); + unsigned CFIIndex = + MF.addFrameInst(MCCFIInstruction::createDefCfaRegister(nullptr, Reg)); + BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex) + .setMIFlags(MachineInstr::FrameSetup); + } + } + + if (ResidualSize != 0) { + // SUB SP, SP, #ResidualSize (or equivalent if ResidualSize is not + // encodable in a SUB).
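 + // A trailing probe is needed only if the residual allocation could leave + // more than StackProbeMaxUnprobedStack (1024) bytes of unprobed stack + // above SP.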
+ emitFrameOffset(*MBB, MBBI, DL, AArch64::SP, AArch64::SP, + StackOffset::getFixed(-ResidualSize), TII, + MachineInstr::FrameSetup, false, false, nullptr, + EmitAsyncCFI && !HasFP, CFAOffset); + if (ResidualSize > AArch64::StackProbeMaxUnprobedStack) { + // STR XZR, [SP] + BuildMI(*MBB, MBBI, DL, TII->get(AArch64::STRXui)) + .addReg(AArch64::XZR) + .addReg(AArch64::SP) + .addImm(0) + .setMIFlags(MachineInstr::FrameSetup); + } + } + + MachineBasicBlock::iterator Next = std::next(MBBI); + MBBI->eraseFromParent(); + return Next; +} + +MachineBasicBlock::iterator AArch64FrameLowering::inlineStackProbeVar( + MachineBasicBlock::iterator MBBI) const { + MachineBasicBlock &MBB = *MBBI->getParent(); + MachineFunction &MF = *MBB.getParent(); + const AArch64InstrInfo *TII = + MF.getSubtarget().getInstrInfo(); + + DebugLoc DL = MBB.findDebugLoc(MBBI); + Register ScratchReg = MBBI->getOperand(0).getReg(); + Register TargetReg = MBBI->getOperand(1).getReg(); + MachineBasicBlock::iterator NextInst = std::next(MBBI); + + NextInst = TII->insertStackProbingLoop(MBBI, ScratchReg, TargetReg); + + MBBI->eraseFromParent(); + return NextInst; +} + +void AArch64FrameLowering::inlineStackProbe(MachineFunction &MF, + MachineBasicBlock &MBB) const { + for (auto MBBI = MBB.begin(), E = MBB.end(); MBBI != E;) { + if (MBBI->getOpcode() == AArch64::PROBED_STACKALLOC) { + MBBI = inlineStackProbeFixed(MBBI); + E = MBBI->getParent()->end(); + } else if (MBBI->getOpcode() == AArch64::PROBED_STACKALLOC_VAR) { + MBBI = inlineStackProbeVar(MBBI); + E = MBBI->getParent()->end(); + } else { + ++MBBI; + } + } +} Index: llvm/lib/Target/AArch64/AArch64ISelLowering.h =================================================================== --- llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -508,6 +508,13 @@ ArrayRef getGPRArgRegs(); ArrayRef getFPRArgRegs(); +/// Maximum allowed number of unprobed bytes above SP at an ABI +/// boundary. +const unsigned StackProbeMaxUnprobedStack = 1024; + +/// Maximum number of iterations to unroll for a constant size probing loop. +const unsigned StackProbeMaxLoopUnroll = 4; + } // namespace AArch64 class AArch64Subtarget; @@ -942,6 +949,13 @@ // used for 64bit and 128bit vectors as well. bool useSVEForFixedLengthVectorVT(EVT VT, bool OverrideNEON = false) const; + /// True if stack clash protection is enabled for this functions. + bool hasInlineStackProbe(const MachineFunction &MF) const override; + + /// Get the interval between stack-clash probes, which is equal to the stack + /// guard size, in bytes. + unsigned getStackProbeSize(const MachineFunction &MF) const; + private: /// Keep a pointer to the AArch64Subtarget around so that we can /// make the right decision when generating code for different targets. Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -26088,3 +26088,37 @@ } return true; } + +bool AArch64TargetLowering::hasInlineStackProbe( + const MachineFunction &MF) const { + // If the function specifically requests inline stack probes, emit them. 
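 + // A function requesting this carries IR attributes such as (illustrative) + // "probe-stack"="inline-asm", optionally with "stack-probe-size"="N", as in + // the tests below.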
+ if (MF.getFunction().hasFnAttribute("probe-stack")) { + if (MF.getFunction().getFnAttribute("probe-stack").getValueAsString() == + "inline-asm") + return true; + else + llvm_unreachable("Unsupported stack probing method"); + } + + return false; +} + +unsigned +AArch64TargetLowering::getStackProbeSize(const MachineFunction &MF) const { + const TargetFrameLowering *TFI = Subtarget->getFrameLowering(); + unsigned StackAlign = TFI->getStackAlignment(); + assert(StackAlign >= 1 && isPowerOf2_32(StackAlign) && + "Unexpected stack alignment"); + // The default stack probe size is 4096 if the function has no + // stack-probe-size attribute. This is a safe default because it is the + // smallest possible guard page size. + unsigned StackProbeSize = 4096; + const Function &Fn = MF.getFunction(); + if (Fn.hasFnAttribute("stack-probe-size")) + Fn.getFnAttribute("stack-probe-size") + .getValueAsString() + .getAsInteger(0, StackProbeSize); + // Round down to the stack alignment. + StackProbeSize &= ~(StackAlign - 1); + return StackProbeSize ? StackProbeSize : StackAlign; +} Index: llvm/lib/Target/AArch64/AArch64InstrInfo.h =================================================================== --- llvm/lib/Target/AArch64/AArch64InstrInfo.h +++ llvm/lib/Target/AArch64/AArch64InstrInfo.h @@ -347,6 +347,11 @@ static void decomposeStackOffsetForDwarfOffsets(const StackOffset &Offset, int64_t &ByteSized, int64_t &VGSized); + + MachineBasicBlock::iterator + insertStackProbingLoop(MachineBasicBlock::iterator MBBI, Register ScratchReg, + Register TargetReg) const; + #define GET_INSTRINFO_HELPER_DECLS #include "AArch64GenInstrInfo.inc" Index: llvm/lib/Target/AArch64/AArch64InstrInfo.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -10,8 +10,8 @@ // //===----------------------------------------------------------------------===// -#include "AArch64ExpandImm.h" #include "AArch64InstrInfo.h" +#include "AArch64ExpandImm.h" #include "AArch64FrameLowering.h" #include "AArch64MachineFunctionInfo.h" #include "AArch64Subtarget.h" @@ -20,6 +20,7 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineCombinerPattern.h" #include "llvm/CodeGen/MachineFrameInfo.h" @@ -8461,6 +8462,96 @@ return AArch64::BLR; } +MachineBasicBlock::iterator +AArch64InstrInfo::insertStackProbingLoop(MachineBasicBlock::iterator MBBI, + Register ScratchReg, + Register TargetReg) const { + MachineBasicBlock &MBB = *MBBI->getParent(); + MachineFunction &MF = *MBB.getParent(); + const AArch64TargetLowering *TLI = + MF.getSubtarget().getTargetLowering(); + const AArch64InstrInfo *TII = + MF.getSubtarget().getInstrInfo(); + int64_t ProbeSize = (int64_t)TLI->getStackProbeSize(MF); + DebugLoc DL = MBB.findDebugLoc(MBBI); + + MachineFunction::iterator MBBInsertPoint = std::next(MBB.getIterator()); + MachineBasicBlock *LoopTestMBB = + MF.CreateMachineBasicBlock(MBB.getBasicBlock()); + MF.insert(MBBInsertPoint, LoopTestMBB); + MachineBasicBlock *LoopBodyMBB = + MF.CreateMachineBasicBlock(MBB.getBasicBlock()); + MF.insert(MBBInsertPoint, LoopBodyMBB); + MachineBasicBlock *ExitMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock()); + MF.insert(MBBInsertPoint, ExitMBB); + + // LoopTest: + // SUB ScratchReg, ScratchReg, #ProbeSize + emitFrameOffset(*LoopTestMBB, LoopTestMBB->end(), DL, 
ScratchReg, ScratchReg, + StackOffset::getFixed(-ProbeSize), TII, + MachineInstr::FrameSetup); + + // CMP ScratchReg, TargetReg + AArch64CC::CondCode Cond = AArch64CC::LE; + Register Op1 = ScratchReg; + Register Op2 = TargetReg; + if (Op2 == AArch64::SP) { + assert(Op1 != AArch64::SP && "At most one of the registers can be SP"); + // CMP TargetReg, ScratchReg + std::swap(Op1, Op2); + Cond = AArch64CC::GT; + } + BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(AArch64::SUBSXrx64), + AArch64::XZR) + .addReg(Op1) + .addReg(Op2) + .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 0)) + .setMIFlags(MachineInstr::FrameSetup); + + // B. LoopExit + BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(AArch64::Bcc)) + .addImm(Cond) + .addMBB(ExitMBB) + .setMIFlags(MachineInstr::FrameSetup); + + // STR XZR, [ScratchReg] + BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII->get(AArch64::STRXui)) + .addReg(AArch64::XZR) + .addReg(ScratchReg) + .addImm(0) + .setMIFlags(MachineInstr::FrameSetup); + + // B loop + BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII->get(AArch64::B)) + .addMBB(LoopTestMBB) + .setMIFlags(MachineInstr::FrameSetup); + + // LoopExit: + // STR XZR, [TargetReg] + BuildMI(*ExitMBB, ExitMBB->begin(), DL, TII->get(AArch64::STRXui)) + .addReg(AArch64::XZR) + .addReg(TargetReg) + .addImm(0) + .setMIFlags(MachineInstr::FrameSetup); + + ExitMBB->splice(ExitMBB->end(), &MBB, std::next(MBBI), MBB.end()); + ExitMBB->transferSuccessorsAndUpdatePHIs(&MBB); + + LoopTestMBB->addSuccessor(ExitMBB); + LoopTestMBB->addSuccessor(LoopBodyMBB); + LoopBodyMBB->addSuccessor(LoopTestMBB); + MBB.addSuccessor(LoopTestMBB); + + // Update liveins. + if (MF.getRegInfo().reservedRegsFrozen()) { + recomputeLiveIns(*LoopTestMBB); + recomputeLiveIns(*LoopBodyMBB); + recomputeLiveIns(*ExitMBB); + } + + return ExitMBB->begin(); +} + #define GET_INSTRINFO_HELPERS #define GET_INSTRMAP_INFO #include "AArch64GenInstrInfo.inc" Index: llvm/lib/Target/AArch64/AArch64InstrInfo.td =================================================================== --- llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -886,7 +886,8 @@ // Miscellaneous instructions. //===----------------------------------------------------------------------===// -let Defs = [SP], Uses = [SP], hasSideEffects = 1, isCodeGenOnly = 1 in { +let hasSideEffects = 1, isCodeGenOnly = 1 in { +let Defs = [SP], Uses = [SP] in { // We set Sched to empty list because we expect these instructions to simply get // removed in most cases. def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2), @@ -895,7 +896,23 @@ def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2), [(AArch64callseq_end timm:$amt1, timm:$amt2)]>, Sched<[]>; -} // Defs = [SP], Uses = [SP], hasSideEffects = 1, isCodeGenOnly = 1 + +// Probed stack allocation of a constant size, used in function prologues when +// stack-clash protection is enabled. +def PROBED_STACKALLOC : Pseudo<(outs GPR64:$scratch), + (ins i64imm:$stacksize, i64imm:$fixed_offset, + i64imm:$scalable_offset), + []>, + Sched<[]>; +} // Defs = [SP], Uses = [SP] + +// Probed stack allocation of a variable size, used in function prologues when +// stack-clash protection is enabled. 
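 +// It is expanded by inlineStackProbe into a loop that decrements $scratch by +// the probe size and stores XZR to it until it reaches $target, followed by a +// final probe at $target.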
+def PROBED_STACKALLOC_VAR : Pseudo<(outs GPR64sp:$scratch), + (ins GPR64sp:$target), + []>, + Sched<[]>; +} //hasSideEffects = 1, isCodeGenOnly = 1 let isReMaterializable = 1, isCodeGenOnly = 1 in { // FIXME: The following pseudo instructions are only needed because remat Index: llvm/test/CodeGen/AArch64/framelayout-sve.mir =================================================================== --- llvm/test/CodeGen/AArch64/framelayout-sve.mir +++ llvm/test/CodeGen/AArch64/framelayout-sve.mir @@ -206,7 +206,7 @@ # CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w29, -16 # CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -2 # CHECK-NEXT: $[[TMP:x[0-9]+]] = frame-setup SUBXri $sp, 16, 0 -# CHECK-NEXT: $sp = ANDXri killed $[[TMP]] +# CHECK-NEXT: $sp = frame-setup ANDXri killed $[[TMP]] # CHECK-NEXT: $sp = frame-destroy ADDXri $fp, 0, 0 # CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 16 # CHECK-NEXT: $sp, $fp, $lr = frame-destroy LDPXpost $sp, 2 @@ -1032,7 +1032,7 @@ # CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 # CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -1 # CHECK-NEXT: $[[TMP:x[0-9]+]] = frame-setup SUBXri $sp, 16, 0 -# CHECK-NEXT: $sp = ANDXri killed $[[TMP]] +# CHECK-NEXT: $sp = frame-setup ANDXri killed $[[TMP]] # CHECK: $sp = frame-destroy ADDVL_XXI $fp, -18 # CHECK-NEXT: $p15 = frame-destroy LDR_PXI $sp, 4 Index: llvm/test/CodeGen/AArch64/spill-stack-realignment.mir =================================================================== --- llvm/test/CodeGen/AArch64/spill-stack-realignment.mir +++ llvm/test/CodeGen/AArch64/spill-stack-realignment.mir @@ -21,7 +21,7 @@ - { id: 1, size: 4, alignment: 4, local-offset: -68 } # CHECK: body: -# CHECK: $sp = ANDXri killed ${{x[0-9]+}}, 7865 +# CHECK: $sp = frame-setup ANDXri killed ${{x[0-9]+}}, 7865 # CHECK: STRSui $s0, $sp, 0 # CHECK: STRSui $s0, $fp, 7 body: | Index: llvm/test/CodeGen/AArch64/stack-probing-64k.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AArch64/stack-probing-64k.ll @@ -0,0 +1,392 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs -enable-post-misched=false | FileCheck %s +; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs -enable-post-misched=false -global-isel | FileCheck %s + +; Tests for prolog sequences for stack probing, when using a 64KiB stack guard. + +; 64k bytes is the largest frame we can probe in one go. +define void @static_65536(i8** %out) #0 { +; CHECK-LABEL: static_65536: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 +; CHECK-NEXT: .cfi_def_cfa_offset 65552 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #16, lsl #12 // =65536 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 65536, align 1 + store i8* %vla, i8** %out, align 8 + ret void +} + +; 64k+16 bytes, still needs just one probe. +define void @static_65552(ptr %out) #0 { +; CHECK-LABEL: static_65552: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 +; CHECK-NEXT: .cfi_def_cfa_offset 65552 +; CHECK-NEXT: str xzr, [sp], #-16 +; CHECK-NEXT: .cfi_def_cfa_offset 65568 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #16, lsl #12 // =65536 +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 65552, align 1 + store i8* %vla, i8** %out, align 8 + ret void +} + +; 64k+1024 bytes, the largest frame which needs just one probe. +define void @static_66560(ptr %out) #0 { +; CHECK-LABEL: static_66560: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 +; CHECK-NEXT: .cfi_def_cfa_offset 65552 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: .cfi_def_cfa_offset 66576 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #16, lsl #12 // =65536 +; CHECK-NEXT: .cfi_def_cfa_offset 1040 +; CHECK-NEXT: add sp, sp, #1024 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 66560, align 1 + store i8* %vla, i8** %out, align 8 + ret void +} + +; 64k+1024+16 bytes, the smallest frame which needs two probes. +define void @static_66576(ptr %out) #0 { +; CHECK-LABEL: static_66576: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 +; CHECK-NEXT: .cfi_def_cfa_offset 65552 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: sub sp, sp, #1040 +; CHECK-NEXT: .cfi_def_cfa_offset 66592 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #16, lsl #12 // =65536 +; CHECK-NEXT: .cfi_def_cfa_offset 1056 +; CHECK-NEXT: add sp, sp, #1040 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 66576, align 1 + store i8* %vla, i8** %out, align 8 + ret void +} + +; 2*64k+1024, the largest frame needing two probes. +define void @static_132096(i8** %out) #0 { +; CHECK-LABEL: static_132096: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 +; CHECK-NEXT: .cfi_def_cfa_offset 65552 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 +; CHECK-NEXT: .cfi_def_cfa_offset 131088 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: .cfi_def_cfa_offset 132112 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #32, lsl #12 // =131072 +; CHECK-NEXT: .cfi_def_cfa_offset 1040 +; CHECK-NEXT: add sp, sp, #1024 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 132096, align 1 + store i8* %vla, i8** %out, align 8 + ret void +} + +; 5*64k-16, the largest frame probed without a loop. +define void @static_327664(i8** %out) #0 { +; CHECK-LABEL: static_327664: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 +; CHECK-NEXT: .cfi_def_cfa_offset 65552 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 +; CHECK-NEXT: .cfi_def_cfa_offset 131088 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 +; CHECK-NEXT: .cfi_def_cfa_offset 196624 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 +; CHECK-NEXT: .cfi_def_cfa_offset 262160 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: sub sp, sp, #15, lsl #12 // =61440 +; CHECK-NEXT: .cfi_def_cfa_offset 323600 +; CHECK-NEXT: sub sp, sp, #4080 +; CHECK-NEXT: .cfi_def_cfa_offset 327680 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #79, lsl #12 // =323584 +; CHECK-NEXT: .cfi_def_cfa_offset 4096 +; CHECK-NEXT: add sp, sp, #4080 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 327664, align 1 + store i8* %vla, i8** %out, align 8 + ret void +} + +; 5*64k, smallest frame probed with a loop. +define void @static_327680(i8** %out) #0 { +; CHECK-LABEL: static_327680: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub x9, sp, #80, lsl #12 // =327680 +; CHECK-NEXT: .cfi_def_cfa w9, 327696 +; CHECK-NEXT: .LBB6_1: // %entry +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: cmp sp, x9 +; CHECK-NEXT: b.ne .LBB6_1 +; CHECK-NEXT: // %bb.2: // %entry +; CHECK-NEXT: .cfi_def_cfa_register wsp +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #80, lsl #12 // =327680 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 327680, align 1 + store i8* %vla, i8** %out, align 8 + ret void +} + +; 5*64k+1024, large enough to use a loop, but not a multiple of 64KiB +; so has a reminder, but no extra probe. 
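 +; (The 1024-byte remainder does not exceed the 1024-byte unprobed-stack +; limit, so the probe issued by the last loop iteration is sufficient.)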
+define void @static_328704(i8** %out) #0 { +; CHECK-LABEL: static_328704: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub x9, sp, #80, lsl #12 // =327680 +; CHECK-NEXT: .cfi_def_cfa w9, 327696 +; CHECK-NEXT: .LBB7_1: // %entry +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: cmp sp, x9 +; CHECK-NEXT: b.ne .LBB7_1 +; CHECK-NEXT: // %bb.2: // %entry +; CHECK-NEXT: .cfi_def_cfa_register wsp +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: .cfi_def_cfa_offset 328720 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #80, lsl #12 // =327680 +; CHECK-NEXT: .cfi_def_cfa_offset 1040 +; CHECK-NEXT: add sp, sp, #1024 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 328704, align 1 + store i8* %vla, i8** %out, align 8 + ret void +} + +; 5*64k+1040, large enough to use a loop, has a reminder and +; an extra probe. +define void @static_328720(i8** %out) #0 { +; CHECK-LABEL: static_328720: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub x9, sp, #80, lsl #12 // =327680 +; CHECK-NEXT: .cfi_def_cfa w9, 327696 +; CHECK-NEXT: .LBB8_1: // %entry +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: cmp sp, x9 +; CHECK-NEXT: b.ne .LBB8_1 +; CHECK-NEXT: // %bb.2: // %entry +; CHECK-NEXT: .cfi_def_cfa_register wsp +; CHECK-NEXT: sub sp, sp, #1040 +; CHECK-NEXT: .cfi_def_cfa_offset 328736 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #80, lsl #12 // =327680 +; CHECK-NEXT: .cfi_def_cfa_offset 1056 +; CHECK-NEXT: add sp, sp, #1040 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 328720, align 1 + store i8* %vla, i8** %out, align 8 + ret void +} + +; A small allocation, but with a very large alignment requirement. We do this +; by moving SP far enough that a sufficiently-aligned block will exist +; somewhere in the stack frame, so must probe the whole of that larger SP move. +define void @static_16_align_131072(i8** %out) #0 { +; CHECK-LABEL: static_16_align_131072: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: .cfi_def_cfa w29, 16 +; CHECK-NEXT: .cfi_offset w30, -8 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub x9, sp, #31, lsl #12 // =126976 +; CHECK-NEXT: sub x9, x9, #4080 +; CHECK-NEXT: and x9, x9, #0xfffffffffffe0000 +; CHECK-NEXT: .LBB9_1: // %entry +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 +; CHECK-NEXT: cmp sp, x9 +; CHECK-NEXT: b.le .LBB9_3 +; CHECK-NEXT: // %bb.2: // %entry +; CHECK-NEXT: // in Loop: Header=BB9_1 Depth=1 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: b .LBB9_1 +; CHECK-NEXT: .LBB9_3: // %entry +; CHECK-NEXT: str xzr, [x9] +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: .cfi_def_cfa wsp, 16 +; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w30 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 16, align 131072 + store i8* %vla, i8** %out, align 8 + ret void +} + +; A small allocation, but with a very large alignment requirement which +; is nevertheless small enough as to not need a loop. +define void @static_16_align_8192(i8** %out) #0 { +; CHECK-LABEL: static_16_align_8192: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: .cfi_def_cfa w29, 16 +; CHECK-NEXT: .cfi_offset w30, -8 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub x9, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: sub x9, x9, #4080 +; CHECK-NEXT: and sp, x9, #0xffffffffffffe000 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: .cfi_def_cfa wsp, 16 +; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w30 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 16, align 8192 + store i8* %vla, i8** %out, align 8 + ret void +} + +; A large allocation with a very large alignment requirement which +; is nevertheless small enough as to not need a loop. +define void @static_32752_align_32k(i8** %out) #0 { +; CHECK-LABEL: static_32752_align_32k: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: .cfi_def_cfa w29, 16 +; CHECK-NEXT: .cfi_offset w30, -8 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub x9, sp, #7, lsl #12 // =28672 +; CHECK-NEXT: sub x9, x9, #4080 +; CHECK-NEXT: and sp, x9, #0xffffffffffff8000 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: .cfi_def_cfa wsp, 16 +; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w30 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 32752, align 32768 + store i8* %vla, i8** %out, align 8 + ret void +} + +attributes #0 = { uwtable(async) "probe-stack"="inline-asm" "stack-probe-size"="65536" "frame-pointer"="none" } Index: llvm/test/CodeGen/AArch64/stack-probing-sve.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AArch64/stack-probing-sve.ll @@ -0,0 +1,661 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs | FileCheck %s +; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs -global-isel -global-isel-abort=2 | FileCheck %s + +; Test prolog sequences for stack probing when SVE objects are involved. + +; The space for SVE objects needs probing in the general case, because +; the stack adjustment may happen to be too big (i.e. greater than the +; probe size) to allocate with a single `addvl`. +; When we do know that the stack adjustment cannot exceed the probe size +; we can avoid emitting a probe loop and emit a simple `addvl; str` +; sequence instead. + +define void @sve_1_vector(** %out) #0 { +; CHECK-LABEL: sve_1_vector: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: .cfi_def_cfa wsp, 16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %vec = alloca , align 16 + ret void +} + +; As above, but with 4 SVE vectors of stack space. +define void @sve_4_vector(** %out) #0 { +; CHECK-LABEL: sve_4_vector: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: addvl sp, sp, #-4 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: addvl sp, sp, #4 +; CHECK-NEXT: .cfi_def_cfa wsp, 16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %vec1 = alloca , align 16 + %vec2 = alloca , align 16 + %vec3 = alloca , align 16 + %vec4 = alloca , align 16 + ret void +} + +; As above, but with 16 SVE vectors of stack space. +; The stack adjustment is less than or equal to 16 x 256 = 4096, so +; we can allocate the locals at once. 
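 +; (The architectural maximum SVE register size is 256 bytes, so 16 vectors +; can never exceed the 4096-byte probe size.)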
+define void @sve_16_vector(** %out) #0 { +; CHECK-LABEL: sve_16_vector: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: addvl sp, sp, #-16 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 128 * VG +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: addvl sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa wsp, 16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %vec1 = alloca , align 16 + %vec2 = alloca , align 16 + %vec3 = alloca , align 16 + %vec4 = alloca , align 16 + %vec5 = alloca , align 16 + %vec6 = alloca , align 16 + %vec7 = alloca , align 16 + %vec8 = alloca , align 16 + %vec9 = alloca , align 16 + %vec10 = alloca , align 16 + %vec11 = alloca , align 16 + %vec12 = alloca , align 16 + %vec13 = alloca , align 16 + %vec14 = alloca , align 16 + %vec15 = alloca , align 16 + %vec16 = alloca , align 16 + ret void +} + +; As above, but with 17 SVE vectors of stack space. Now we need +; a probing loops since stack adjustment may be greater than +; the probe size (17 x 256 = 4354 bytes) +define void @sve_17_vector(** %out) #0 { +; CHECK-LABEL: sve_17_vector: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: addvl x9, sp, #-17 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x88, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 136 * VG +; CHECK-NEXT: .LBB3_1: // %entry +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: cmp sp, x9 +; CHECK-NEXT: b.le .LBB3_3 +; CHECK-NEXT: // %bb.2: // %entry +; CHECK-NEXT: // in Loop: Header=BB3_1 Depth=1 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: b .LBB3_1 +; CHECK-NEXT: .LBB3_3: // %entry +; CHECK-NEXT: str xzr, [x9] +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: .cfi_def_cfa_register wsp +; CHECK-NEXT: addvl sp, sp, #17 +; CHECK-NEXT: .cfi_def_cfa wsp, 16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %vec1 = alloca , align 16 + %vec2 = alloca , align 16 + %vec3 = alloca , align 16 + %vec4 = alloca , align 16 + %vec5 = alloca , align 16 + %vec6 = alloca , align 16 + %vec7 = alloca , align 16 + %vec8 = alloca , align 16 + %vec9 = alloca , align 16 + %vec10 = alloca , align 16 + %vec11 = alloca , align 16 + %vec12 = alloca , align 16 + %vec13 = alloca , align 16 + %vec14 = alloca , align 16 + %vec15 = alloca , align 16 + %vec16 = alloca , align 16 + %vec17 = alloca , align 16 + ret void +} + +; Space for callee-saved SVE register is allocated similarly to allocating +; space for SVE locals. When we know the stack adjustment cannot exceed the +; probe size we can skip the explict probe, since saving SVE registers serves +; as an implicit probe. +define void @sve_1v_csr( %a) #0 { +; CHECK-LABEL: sve_1v_csr: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: str z8, [sp] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG +; CHECK-NEXT: //APP +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: ldr z8, [sp] // 16-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: .cfi_def_cfa wsp, 16 +; CHECK-NEXT: .cfi_restore z8 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + call void asm sideeffect "", "~{z8}" () + ret void +} + +define void @sve_4v_csr( %a) #0 { +; CHECK-LABEL: sve_4v_csr: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: addvl sp, sp, #-4 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: str z11, [sp] // 16-byte Folded Spill +; CHECK-NEXT: str z10, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z9, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z8, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 32 * VG +; CHECK-NEXT: //APP +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: ldr z11, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ldr z10, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z9, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z8, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #4 +; CHECK-NEXT: .cfi_def_cfa wsp, 16 +; CHECK-NEXT: .cfi_restore z8 +; CHECK-NEXT: .cfi_restore z9 +; CHECK-NEXT: .cfi_restore z10 +; CHECK-NEXT: .cfi_restore z11 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + call void asm sideeffect "", "~{z8},~{z9},~{z10},~{z11}" () + ret void +} + +define void @sve_16v_csr( %a) #0 { +; CHECK-LABEL: sve_16v_csr: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: addvl sp, sp, #-16 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 128 * VG +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: str z23, [sp] // 16-byte Folded Spill +; CHECK-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z12, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z8, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 32 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 16 - 40 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 16 - 48 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 16 - 56 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG +; CHECK-NEXT: //APP +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: ldr z23, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z8, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa wsp, 16 
+; CHECK-NEXT: .cfi_restore z8 +; CHECK-NEXT: .cfi_restore z9 +; CHECK-NEXT: .cfi_restore z10 +; CHECK-NEXT: .cfi_restore z11 +; CHECK-NEXT: .cfi_restore z12 +; CHECK-NEXT: .cfi_restore z13 +; CHECK-NEXT: .cfi_restore z14 +; CHECK-NEXT: .cfi_restore z15 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + call void asm sideeffect "", "~{z8},~{z9},~{z10},~{z11},~{z12},~{z13},~{z14},~{z15},~{z16},~{z17},~{z18},~{z19},~{z20},~{z21},~{z22},~{z23}" () + ret void +} + +define void @sve_1p_csr( %a) #0 { +; CHECK-LABEL: sve_1p_csr: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: //APP +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: .cfi_def_cfa wsp, 16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + call void asm sideeffect "", "~{p8}" () + ret void +} + +define void @sve_4p_csr( %a) #0 { +; CHECK-LABEL: sve_4p_csr: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: str p11, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p10, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p9, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: //APP +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: ldr p11, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p10, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p9, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: .cfi_def_cfa wsp, 16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + call void asm sideeffect "", "~{p8},~{p9},~{p10},~{p11}" () + ret void +} + +define void @sve_16v_1p_csr( %a) #0 { +; CHECK-LABEL: sve_16v_1p_csr: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: addvl x9, sp, #-17 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x88, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 136 * VG +; CHECK-NEXT: .LBB9_1: // %entry +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: cmp sp, x9 +; CHECK-NEXT: b.le .LBB9_3 +; CHECK-NEXT: // %bb.2: // %entry +; CHECK-NEXT: // in Loop: Header=BB9_1 Depth=1 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: b .LBB9_1 +; CHECK-NEXT: .LBB9_3: // %entry +; CHECK-NEXT: str xzr, [x9] +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: .cfi_def_cfa_register wsp +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 32 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 16 - 40 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 16 - 48 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 16 - 56 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG +; CHECK-NEXT: //APP +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z15, 
[sp, #9, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #17 +; CHECK-NEXT: .cfi_def_cfa wsp, 16 +; CHECK-NEXT: .cfi_restore z8 +; CHECK-NEXT: .cfi_restore z9 +; CHECK-NEXT: .cfi_restore z10 +; CHECK-NEXT: .cfi_restore z11 +; CHECK-NEXT: .cfi_restore z12 +; CHECK-NEXT: .cfi_restore z13 +; CHECK-NEXT: .cfi_restore z14 +; CHECK-NEXT: .cfi_restore z15 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + call void asm sideeffect "", "~{p8},~{z8},~{z9},~{z10},~{z11},~{z12},~{z13},~{z14},~{z15},~{z16},~{z17},~{z18},~{z19},~{z20},~{z21},~{z22},~{z23}" () + ret void +} + +; A SVE vector and a 16-byte fixed size object. +define void @sve_1_vector_16_arr(** %out) #0 { +; CHECK-LABEL: sve_1_vector_16_arr: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG +; CHECK-NEXT: str xzr, [sp], #-16 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 32 + 8 * VG +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: .cfi_def_cfa wsp, 32 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %vec = alloca , align 16 + %arr = alloca i8, i64 16, align 1 + ret void +} + +; A large SVE stack object and a large stack slot, both of which need probing. +; TODO: This could be optimised by combining the fixed-size offset into the +; loop. +define void @sve_1_vector_4096_arr(** %out) #0 { +; CHECK-LABEL: sve_1_vector_4096_arr: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: addvl x9, sp, #-32 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x02, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 256 * VG +; CHECK-NEXT: addvl x9, x9, #-32 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x04, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 512 * VG +; CHECK-NEXT: .LBB11_1: // %entry +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: cmp sp, x9 +; CHECK-NEXT: b.le .LBB11_3 +; CHECK-NEXT: // %bb.2: // %entry +; CHECK-NEXT: // in Loop: Header=BB11_1 Depth=1 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: b .LBB11_1 +; CHECK-NEXT: .LBB11_3: // %entry +; CHECK-NEXT: str xzr, [x9] +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: .cfi_def_cfa_register wsp +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0x90, 0x20, 0x22, 0x11, 0x80, 0x04, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 4112 + 512 * VG +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0f, 0x8f, 0x00, 0x11, 0x90, 0xc0, 0x00, 0x22, 0x11, 0x80, 0x04, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 8208 + 512 * VG +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0f, 0x8f, 0x00, 0x11, 0x90, 0xe0, 0x00, 0x22, 0x11, 0x80, 0x04, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 12304 + 512 * VG +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: addvl sp, sp, #31 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0f, 0x8f, 0x00, 0x11, 0x90, 0xe0, 0x00, 0x22, 0x11, 0x88, 0x02, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 12304 + 264 * VG +; CHECK-NEXT: addvl sp, sp, #31 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0x90, 0xe0, 0x00, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 12304 + 16 * VG +; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: .cfi_def_cfa wsp, 12304 +; CHECK-NEXT: add sp, sp, #3, lsl #12 // =12288 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %vec = alloca , align 16 + %arr = alloca i8, i64 12288, align 1 + ret void +} + +; Not tested: SVE stack objects with alignment >16 bytes, which isn't currently +; supported even without stack-probing. + +; An SVE vector, and a 16-byte fixed size object, which +; has a large alignment requirement. +define void @sve_1_vector_16_arr_align_8192(** %out) #0 { +; CHECK-LABEL: sve_1_vector_16_arr_align_8192: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: .cfi_def_cfa w29, 16 +; CHECK-NEXT: .cfi_offset w30, -8 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: sub x9, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: sub x9, x9, #4080 +; CHECK-NEXT: and x9, x9, #0xffffffffffffe000 +; CHECK-NEXT: .LBB12_1: // %entry +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: cmp sp, x9 +; CHECK-NEXT: b.le .LBB12_3 +; CHECK-NEXT: // %bb.2: // %entry +; CHECK-NEXT: // in Loop: Header=BB12_1 Depth=1 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: b .LBB12_1 +; CHECK-NEXT: .LBB12_3: // %entry +; CHECK-NEXT: str xzr, [x9] +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: .cfi_def_cfa wsp, 16 +; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w30 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %vec = alloca , align 16 + %arr = alloca i8, i64 16, align 8192 + ret void +} + +; With 64k guard pages, we can allocate bigger SVE space without a probing loop. +define void @sve_1024_64k_guard(** %out) #0 "stack-probe-size"="65536" { +; CHECK-LABEL: sve_1024_64k_guard: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: addvl sp, sp, #-32 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x02, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 256 * VG +; CHECK-NEXT: addvl sp, sp, #-32 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x04, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 512 * VG +; CHECK-NEXT: addvl sp, sp, #-32 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x06, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 768 * VG +; CHECK-NEXT: addvl sp, sp, #-32 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1024 * VG +; CHECK-NEXT: addvl sp, sp, #-32 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x0a, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1280 * VG +; CHECK-NEXT: addvl sp, sp, #-32 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x0c, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1536 * VG +; CHECK-NEXT: addvl sp, sp, #-32 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x0e, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1792 * VG +; CHECK-NEXT: addvl sp, sp, #-32 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 2048 * VG +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: addvl sp, sp, #31 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x88, 0x0e, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1800 * VG +; CHECK-NEXT: addvl sp, sp, #31 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x0c, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1552 * VG +; CHECK-NEXT: addvl sp, sp, #31 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x98, 0x0a, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1304 * VG +; CHECK-NEXT: addvl sp, sp, #31 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xa0, 0x08, 0x92, 0x2e, 
0x00, 0x1e, 0x22 // sp + 16 + 1056 * VG +; CHECK-NEXT: addvl sp, sp, #31 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xa8, 0x06, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 808 * VG +; CHECK-NEXT: addvl sp, sp, #31 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xb0, 0x04, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 560 * VG +; CHECK-NEXT: addvl sp, sp, #31 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xb8, 0x02, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 312 * VG +; CHECK-NEXT: addvl sp, sp, #31 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc0, 0x00, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 64 * VG +; CHECK-NEXT: addvl sp, sp, #8 +; CHECK-NEXT: .cfi_def_cfa wsp, 16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %vec = alloca , align 16 + ret void +} + +define void @sve_1028_64k_guard(** %out) #0 "stack-probe-size"="65536" { +; CHECK-LABEL: sve_1028_64k_guard: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: addvl x9, sp, #-32 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x02, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 256 * VG +; CHECK-NEXT: addvl x9, x9, #-32 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x04, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 512 * VG +; CHECK-NEXT: addvl x9, x9, #-32 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x06, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 768 * VG +; CHECK-NEXT: addvl x9, x9, #-32 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 1024 * VG +; CHECK-NEXT: addvl x9, x9, #-32 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x0a, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 1280 * VG +; CHECK-NEXT: addvl x9, x9, #-32 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x0c, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 1536 * VG +; CHECK-NEXT: addvl x9, x9, #-32 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x0e, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 1792 * VG +; CHECK-NEXT: addvl x9, x9, #-32 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 2048 * VG +; CHECK-NEXT: addvl x9, x9, #-1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x88, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 2056 * VG +; CHECK-NEXT: .LBB14_1: // %entry +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 +; CHECK-NEXT: cmp sp, x9 +; CHECK-NEXT: b.le .LBB14_3 +; CHECK-NEXT: // %bb.2: // %entry +; CHECK-NEXT: // in Loop: Header=BB14_1 Depth=1 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: b .LBB14_1 +; CHECK-NEXT: .LBB14_3: // %entry +; CHECK-NEXT: str xzr, [x9] +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: .cfi_def_cfa_register wsp +; CHECK-NEXT: addvl sp, sp, #31 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x0e, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1808 * VG +; CHECK-NEXT: addvl sp, sp, #31 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 
0x22, 0x11, 0x98, 0x0c, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1560 * VG +; CHECK-NEXT: addvl sp, sp, #31 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xa0, 0x0a, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1312 * VG +; CHECK-NEXT: addvl sp, sp, #31 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xa8, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1064 * VG +; CHECK-NEXT: addvl sp, sp, #31 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xb0, 0x06, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 816 * VG +; CHECK-NEXT: addvl sp, sp, #31 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xb8, 0x04, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 568 * VG +; CHECK-NEXT: addvl sp, sp, #31 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc0, 0x02, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 320 * VG +; CHECK-NEXT: addvl sp, sp, #31 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc8, 0x00, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 72 * VG +; CHECK-NEXT: addvl sp, sp, #9 +; CHECK-NEXT: .cfi_def_cfa wsp, 16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %vec = alloca , align 16 + %vec1 = alloca , align 16 + ret void +} + +attributes #0 = { uwtable(async) "probe-stack"="inline-asm" "frame-pointer"="none" "target-features"="+sve" } Index: llvm/test/CodeGen/AArch64/stack-probing.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AArch64/stack-probing.ll @@ -0,0 +1,475 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs -enable-post-misched=false | FileCheck %s +; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs -enable-post-misched=false -global-isel | FileCheck %s + +; Tests for prolog sequences for stack probing, when using a 4KiB stack guard. + +; Small stack frame, no probing required. +define void @static_64(i8** %out) #0 { +; CHECK-LABEL: static_64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub sp, sp, #64 +; CHECK-NEXT: .cfi_def_cfa_offset 64 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #64 +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 64, align 1 + store i8* %vla, i8** %out, align 8 + ret void +} + +; At 256 bytes we start to always create a frame pointer. No frame smaller than +; this needs a probe, so we can use the saving of at least one CSR as a probe +; at the top of our frame. +define void @static_256(i8** %out) #0 { +; CHECK-LABEL: static_256: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub sp, sp, #272 +; CHECK-NEXT: .cfi_def_cfa_offset 272 +; CHECK-NEXT: str x29, [sp, #256] // 8-byte Folded Spill +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #272 +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 256, align 1 + store i8* %vla, i8** %out, align 8 + ret void +} + +; At 1024 bytes, this is the largest frame which doesn't need probing. +define void @static_1024(i8** %out) #0 { +; CHECK-LABEL: static_1024: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]!
// 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: .cfi_def_cfa_offset 1040 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #1024 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 1024, align 1 + store i8* %vla, i8** %out, align 8 + ret void +} + +; At 1024+16 bytes, this is the smallest frame which needs probing. +define void @static_1040(i8** %out) #0 { +; CHECK-LABEL: static_1040: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub sp, sp, #1040 +; CHECK-NEXT: .cfi_def_cfa_offset 1056 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #1040 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 1040, align 1 + store i8* %vla, i8** %out, align 8 + ret void +} + +; 4k bytes is the largest frame we can probe in one go. +define void @static_4096(i8** %out) #0 { +; CHECK-LABEL: static_4096: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: .cfi_def_cfa_offset 4112 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 4096, align 1 + store i8* %vla, i8** %out, align 8 + ret void +} + +; 4k+16 bytes, still needs just one probe. +define void @static_4112(i8** %out) #0 { +; CHECK-LABEL: static_4112: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: .cfi_def_cfa_offset 4112 +; CHECK-NEXT: str xzr, [sp], #-16 +; CHECK-NEXT: .cfi_def_cfa_offset 4128 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 4112, align 1 + store i8* %vla, i8** %out, align 8 + ret void +} + +; 4k+1024 bytes, the largest frame which needs just one probe. +define void @static_5120(i8** %out) #0 { +; CHECK-LABEL: static_5120: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: .cfi_def_cfa_offset 4112 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: .cfi_def_cfa_offset 5136 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: .cfi_def_cfa_offset 1040 +; CHECK-NEXT: add sp, sp, #1024 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 5120, align 1 + store i8* %vla, i8** %out, align 8 + ret void +} + +; 4k+1024+16, the smallest frame which needs two probes. +define void @static_5136(i8** %out) #0 { +; CHECK-LABEL: static_5136: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: .cfi_def_cfa_offset 4112 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: sub sp, sp, #1040 +; CHECK-NEXT: .cfi_def_cfa_offset 5152 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: .cfi_def_cfa_offset 1056 +; CHECK-NEXT: add sp, sp, #1040 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 5136, align 1 + store i8* %vla, i8** %out, align 8 + ret void +} + +; 2*4k+1024, the largest frame needing two probes +define void @static_9216(i8** %out) #0 { +; CHECK-LABEL: static_9216: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: .cfi_def_cfa_offset 4112 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: .cfi_def_cfa_offset 8208 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: .cfi_def_cfa_offset 9232 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #2, lsl #12 // =8192 +; CHECK-NEXT: .cfi_def_cfa_offset 1040 +; CHECK-NEXT: add sp, sp, #1024 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 9216, align 1 + store i8* %vla, i8** %out, align 8 + ret void +} + +; 5*4k-16, the largest frame probed without a loop +define void @static_20464(i8** %out) #0 { +; CHECK-LABEL: static_20464: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: .cfi_def_cfa_offset 4112 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: .cfi_def_cfa_offset 8208 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: .cfi_def_cfa_offset 12304 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: .cfi_def_cfa_offset 16400 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: sub sp, sp, #4080 +; CHECK-NEXT: .cfi_def_cfa_offset 20480 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #4, lsl #12 // =16384 +; CHECK-NEXT: .cfi_def_cfa_offset 4096 +; CHECK-NEXT: add sp, sp, #4080 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 20464, align 1 + store i8* %vla, i8** %out, align 8 + ret void +} + +; 5*4k, the smallest frame probed with a loop +define void @static_20480(i8** %out) #0 { +; CHECK-LABEL: static_20480: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub x9, sp, #5, lsl #12 // =20480 +; CHECK-NEXT: .cfi_def_cfa w9, 20496 +; CHECK-NEXT: .LBB10_1: // %entry +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: cmp sp, x9 +; CHECK-NEXT: b.ne .LBB10_1 +; CHECK-NEXT: // %bb.2: // %entry +; CHECK-NEXT: .cfi_def_cfa_register wsp +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #5, lsl #12 // =20480 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 20480, align 1 + store i8* %vla, i8** %out, align 8 + ret void +} + +; 5*4k + 1024, large enough to use a loop, but not a multiple of 4KiB +; so it has a remainder, but no extra probe. +define void @static_21504(i8** %out) #0 { +; CHECK-LABEL: static_21504: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub x9, sp, #5, lsl #12 // =20480 +; CHECK-NEXT: .cfi_def_cfa w9, 20496 +; CHECK-NEXT: .LBB11_1: // %entry +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: cmp sp, x9 +; CHECK-NEXT: b.ne .LBB11_1 +; CHECK-NEXT: // %bb.2: // %entry +; CHECK-NEXT: .cfi_def_cfa_register wsp +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: .cfi_def_cfa_offset 21520 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #5, lsl #12 // =20480 +; CHECK-NEXT: .cfi_def_cfa_offset 1040 +; CHECK-NEXT: add sp, sp, #1024 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 21504, align 1 + store i8* %vla, i8** %out, align 8 + ret void +} + +; 5*4k+1040, large enough to use a loop, has a remainder and +; an extra probe.
+define void @static_21520(i8** %out) #0 { +; CHECK-LABEL: static_21520: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub x9, sp, #5, lsl #12 // =20480 +; CHECK-NEXT: .cfi_def_cfa w9, 20496 +; CHECK-NEXT: .LBB12_1: // %entry +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: cmp sp, x9 +; CHECK-NEXT: b.ne .LBB12_1 +; CHECK-NEXT: // %bb.2: // %entry +; CHECK-NEXT: .cfi_def_cfa_register wsp +; CHECK-NEXT: sub sp, sp, #1040 +; CHECK-NEXT: .cfi_def_cfa_offset 21536 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #5, lsl #12 // =20480 +; CHECK-NEXT: .cfi_def_cfa_offset 1056 +; CHECK-NEXT: add sp, sp, #1040 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 21520, align 1 + store i8* %vla, i8** %out, align 8 + ret void +} + +; A small allocation, but with a very large alignment requirement. We do this +; by moving SP far enough that a sufficiently-aligned block will exist +; somewhere in the stack frame, so must probe the whole of that larger SP move. +define void @static_16_align_8192(i8** %out) #0 { +; CHECK-LABEL: static_16_align_8192: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: .cfi_def_cfa w29, 16 +; CHECK-NEXT: .cfi_offset w30, -8 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub x9, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: sub x9, x9, #4080 +; CHECK-NEXT: and x9, x9, #0xffffffffffffe000 +; CHECK-NEXT: .LBB13_1: // %entry +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: cmp sp, x9 +; CHECK-NEXT: b.le .LBB13_3 +; CHECK-NEXT: // %bb.2: // %entry +; CHECK-NEXT: // in Loop: Header=BB13_1 Depth=1 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: b .LBB13_1 +; CHECK-NEXT: .LBB13_3: // %entry +; CHECK-NEXT: str xzr, [x9] +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: .cfi_def_cfa wsp, 16 +; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w30 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 16, align 8192 + store i8* %vla, i8** %out, align 8 + ret void +} + +; A small allocation with a very large alignment requirement, but +; nevertheless small enough as to not need a loop. +define void @static_16_align_2048(i8** %out) #0 { +; CHECK-LABEL: static_16_align_2048: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: .cfi_def_cfa w29, 16 +; CHECK-NEXT: .cfi_offset w30, -8 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub x9, sp, #2032 +; CHECK-NEXT: and sp, x9, #0xfffffffffffff800 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: .cfi_def_cfa wsp, 16 +; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w30 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 16, align 2048 + store i8* %vla, i8** %out, align 8 + ret void +} + +; A large(-ish) allocation with a very large alignment requirement, but +; nevertheless small enough as to not need a loop. +define void @static_2032_align_2048(i8** %out) #0 { +; CHECK-LABEL: static_2032_align_2048: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: .cfi_def_cfa w29, 16 +; CHECK-NEXT: .cfi_offset w30, -8 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub x9, sp, #2032 +; CHECK-NEXT: and sp, x9, #0xfffffffffffff800 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: .cfi_def_cfa wsp, 16 +; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w30 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %vla = alloca i8, i64 2032, align 2048 + store i8* %vla, i8** %out, align 8 + ret void +} + +attributes #0 = {uwtable(async) "probe-stack"="inline-asm" "frame-pointer"="none"} +
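
The boundary cases exercised above pin down a simple three-way split for fixed-size frames, summarised in the sketch below. This is a minimal illustration derived only from the test names (static_1024 is the largest frame with no probe, static_20464 the largest probed without a loop, static_20480 the smallest probed with a loop), not the code under review; classifyProbing, its thresholds, and the 4096-byte default are assumptions made for the example.

// Minimal sketch (illustrative only): which prologue shape the tests above
// expect for a given fixed-size frame, assuming the default 4 KiB probe size.
#include <cstdint>
#include <string>

std::string classifyProbing(uint64_t FrameSize, uint64_t ProbeSize = 4096) {
  const uint64_t UnprobedTail = 1024;             // static_1024 vs. static_1040
  if (FrameSize <= UnprobedTail)
    return "plain SP decrement, no probe";        // static_64 .. static_1024
  if (FrameSize < 5 * ProbeSize)
    return "unrolled: str xzr, [sp] per chunk";   // static_1040 .. static_20464
  return "probing loop against a limit in x9";    // static_20480 and larger
}

The SVE tests follow the same pattern: sve_16v_csr still gets a straight addvl plus a single probe, while sve_16v_1p_csr (one more callee-saved register) switches to the x9-based loop, and the 64 KiB "stack-probe-size" variants push that boundary out correspondingly.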