diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -2257,8 +2257,12 @@
   if ((!D || !D->hasAttr<UWTableAttr>()) && CodeGenOpts.UnwindTables)
     B.addUWTableAttr(llvm::UWTableKind(CodeGenOpts.UnwindTables));
 
-  if (CodeGenOpts.StackClashProtector)
+  if (CodeGenOpts.StackClashProtector) {
     B.addAttribute("probe-stack", "inline-asm");
+    if (CodeGenOpts.StackProbeSize != 4096)
+      B.addAttribute("stack-probe-size",
+                     llvm::utostr(CodeGenOpts.StackProbeSize));
+  }
 
   if (!hasUnwindExceptions(LangOpts))
     B.addAttribute(llvm::Attribute::NoUnwind);
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -3447,11 +3447,18 @@
     return;
 
   if (!EffectiveTriple.isX86() && !EffectiveTriple.isSystemZ() &&
-      !EffectiveTriple.isPPC64())
+      !EffectiveTriple.isPPC64() && !EffectiveTriple.isARM() &&
+      !EffectiveTriple.isThumb())
     return;
 
   Args.addOptInFlag(CmdArgs, options::OPT_fstack_clash_protection,
                     options::OPT_fno_stack_clash_protection);
+  if (Args.hasArg(options::OPT_mstack_probe_size)) {
+    StringRef Size = Args.getLastArgValue(options::OPT_mstack_probe_size);
+    CmdArgs.push_back(Args.MakeArgString("-mstack-probe-size=" + Size));
+  } else if (EffectiveTriple.isARM() || EffectiveTriple.isThumb()) {
+    CmdArgs.push_back("-mstack-probe-size=1024");
+  }
 }
 
 static void RenderTrivialAutoVarInitOptions(const Driver &D,
diff --git a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
--- a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
@@ -2040,6 +2040,15 @@
   }
 }
 
+static void EmitDynamicProbedAlloc(MachineInstr &MI, MachineBasicBlock *MBB) {
+  Register TargetReg = MI.getOperand(0).getReg();
+  MachineFunction *MF = MBB->getParent();
+  const ARMFrameLowering *TFI =
+      MF->getSubtarget<ARMSubtarget>().getFrameLowering();
+  TFI->insertStackProbingLoop(MI, TargetReg);
+  MI.eraseFromParent();
+}
+
 bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
                                MachineBasicBlock::iterator MBBI,
                                MachineBasicBlock::iterator &NextMBBI) {
@@ -2048,7 +2057,11 @@
   switch (Opcode) {
     default:
       return false;
-
+    case ARM::PROBED_STACKALLOC_DYN: {
+      EmitDynamicProbedAlloc(MI, &MBB);
+      NextMBBI = MBB.end();
+      return true;
+    }
     case ARM::VBSPd:
     case ARM::VBSPq: {
       Register DstReg = MI.getOperand(0).getReg();
diff --git a/llvm/lib/Target/ARM/ARMFrameLowering.h b/llvm/lib/Target/ARM/ARMFrameLowering.h
--- a/llvm/lib/Target/ARM/ARMFrameLowering.h
+++ b/llvm/lib/Target/ARM/ARMFrameLowering.h
@@ -79,6 +79,10 @@
   const SpillSlot *
   getCalleeSavedSpillSlots(unsigned &NumEntries) const override;
 
+  virtual MachineBasicBlock::iterator
+  insertStackProbingLoop(MachineBasicBlock::iterator MBBI,
+                         Register TargetReg) const;
+
 private:
   void emitPushInst(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
                     ArrayRef<CalleeSavedInfo> CSI, unsigned StmOpc,
@@ -94,6 +98,15 @@
   eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
                                 MachineBasicBlock::iterator MI) const override;
 
+  /// Replace a StackProbe stub (if any) with the actual probe code inline.
+  void inlineStackProbe(MachineFunction &MF,
+                        MachineBasicBlock &PrologueMBB) const override;
+  MachineBasicBlock::iterator
+  inlineStackProbeFixed(MachineFunction &MF,
+                        MachineBasicBlock::iterator MBBI) const;
+  MachineBasicBlock::iterator
+  inlineStackProbeVar(MachineFunction &MF,
+                      MachineBasicBlock::iterator MBBI) const;
 };
 
 } // end namespace llvm
diff --git a/llvm/lib/Target/ARM/ARMFrameLowering.cpp b/llvm/lib/Target/ARM/ARMFrameLowering.cpp
--- a/llvm/lib/Target/ARM/ARMFrameLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMFrameLowering.cpp
@@ -120,6 +120,7 @@
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineConstantPool.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
@@ -173,6 +174,8 @@
 skipAlignedDPRCS2Spills(MachineBasicBlock::iterator MI,
                         unsigned NumAlignedDPRCS2Regs);
 
+static unsigned findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB);
+
 ARMFrameLowering::ARMFrameLowering(const ARMSubtarget &sti)
     : TargetFrameLowering(StackGrowsDown, sti.getStackAlignment(), 0, Align(4)),
       STI(sti) {}
@@ -739,12 +742,13 @@
   const MCRegisterInfo *MRI = Context.getRegisterInfo();
   const ARMBaseRegisterInfo *RegInfo = STI.getRegisterInfo();
   const ARMBaseInstrInfo &TII = *STI.getInstrInfo();
+  const ARMTargetLowering &TLI = *STI.getTargetLowering();
   assert(!AFI->isThumb1OnlyFunction() &&
          "This emitPrologue does not support Thumb1!");
   bool isARM = !AFI->isThumbFunction();
   Align Alignment = STI.getFrameLowering()->getStackAlign();
   unsigned ArgRegsSaveSize = AFI->getArgRegsSaveSize();
-  unsigned NumBytes = MFI.getStackSize();
+  int NumBytes = MFI.getStackSize();
   const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
   int FPCXTSaveSize = 0;
   bool NeedsWinCFI = needsWinCFI(MF);
@@ -1038,11 +1042,21 @@
   }
 
   if (NumBytes) {
+    bool NeedsStackProbe =
+        TLI.hasInlineStackProbe(MF) &&
+        (NumBytes >= TLI.getStackProbeMaxUnprobedStack(MF) ||
+         MFI.hasVarSizedObjects());
+    bool NeedsRealignment = RegInfo->hasStackRealignment(MF);
     // Adjust SP after all the callee-save spills.
     if (AFI->getNumAlignedDPRCS2Regs() == 0 &&
         tryFoldSPUpdateIntoPushPop(STI, MF, &*LastPush, NumBytes))
       DefCFAOffsetCandidates.addExtraBytes(LastPush, NumBytes);
-    else {
+    else if (NeedsStackProbe && !NeedsRealignment) {
+      Register ScratchReg = findScratchNonCalleeSaveRegister(&MBB);
+      assert(ScratchReg != ARM::NoRegister);
+      BuildMI(MBB, MBBI, dl, TII.get(ARM::PROBED_STACKALLOC))
+          .addDef(ScratchReg)
+          .addImm(-NumBytes);
+    } else {
       emitSPUpdate(isARM, MBB, MBBI, dl, TII, -NumBytes,
                    MachineInstr::FrameSetup);
       DefCFAOffsetCandidates.addInst(std::prev(MBBI), NumBytes);
@@ -1222,8 +1236,19 @@
     Align MaxAlign = MFI.getMaxAlign();
     assert(!AFI->isThumb1OnlyFunction());
     if (!AFI->isThumbFunction()) {
-      emitAligningInstructions(MF, AFI, TII, MBB, MBBI, dl, ARM::SP, MaxAlign,
-                               false);
+      bool NeedsStackProbe = TLI.hasInlineStackProbe(MF) &&
+                             (NumBytes + MFI.getMaxAlign().value()) >=
+                                 TLI.getStackProbeMaxUnprobedStack(MF);
+      if (NeedsStackProbe) {
+        Register ScratchReg = findScratchNonCalleeSaveRegister(&MBB);
+        emitAligningInstructions(MF, AFI, TII, MBB, MBBI, dl, ScratchReg,
+                                 MaxAlign, false);
+        BuildMI(MBB, MBBI, dl, TII.get(ARM::PROBED_STACKALLOC_VAR))
+            .addUse(ScratchReg);
+      } else {
+        emitAligningInstructions(MF, AFI, TII, MBB, MBBI, dl, ARM::SP,
+                                 MaxAlign, false);
+      }
     } else {
       // We cannot use sp as source/dest register here, thus we're using r4 to
      // perform the calculations. We're emitting the following sequence:
       // mov r4, sp
@@ -1232,14 +1257,29 @@
       // -- use emitAligningInstructions to zero out lower bits in r4 --
       // mov sp, r4
       // FIXME: It will be better just to find spare register here.
-      BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), ARM::R4)
-          .addReg(ARM::SP, RegState::Kill)
-          .add(predOps(ARMCC::AL));
-      emitAligningInstructions(MF, AFI, TII, MBB, MBBI, dl, ARM::R4, MaxAlign,
-                               false);
-      BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), ARM::SP)
-          .addReg(ARM::R4, RegState::Kill)
-          .add(predOps(ARMCC::AL));
+      bool NeedsStackProbe = TLI.hasInlineStackProbe(MF) &&
+                             (NumBytes + MFI.getMaxAlign().value()) >=
+                                 TLI.getStackProbeMaxUnprobedStack(MF);
+      if (NeedsStackProbe) {
+        BuildMI(MBB, MBBI, dl, TII.get(ARM::t2MOVr), ARM::R4)
+            .addReg(ARM::SP, RegState::Kill)
+            .add(predOps(ARMCC::AL))
+            .add(condCodeOp());
+
+        emitAligningInstructions(MF, AFI, TII, MBB, MBBI, dl, ARM::R4,
+                                 MaxAlign, false);
+        BuildMI(MBB, MBBI, dl, TII.get(ARM::PROBED_STACKALLOC_VAR))
+            .addUse(ARM::R4);
+      } else {
+        BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), ARM::R4)
+            .addReg(ARM::SP, RegState::Kill)
+            .add(predOps(ARMCC::AL));
+        emitAligningInstructions(MF, AFI, TII, MBB, MBBI, dl, ARM::R4,
+                                 MaxAlign, false);
+        BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), ARM::SP)
+            .addReg(ARM::R4, RegState::Kill)
+            .add(predOps(ARMCC::AL));
+      }
     }
 
     AFI->setShouldRestoreSPFromFP(true);
@@ -3385,3 +3425,379 @@
   MF.verify();
 #endif
 }
+
+/// Emit a loop to decrement SP until it is equal to TargetReg, with probes at
+/// least every NegProbeSize bytes. Returns an iterator of the first
+/// instruction after the loop. The difference between SP and TargetReg must be
+/// an exact multiple of NegProbeSize.
+static MachineBasicBlock::iterator inlineStackProbeLoopExactMultiple(
+    MachineFunction &MF, MachineBasicBlock::iterator MBBI,
+    int64_t NegProbeSize, Register TargetReg, bool NeedsWinCFI,
+    bool *HasWinCFI, bool EmitCFI) {
+  MachineBasicBlock &MBB = *MBBI->getParent();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  bool isARM = !AFI->isThumbFunction();
+  const ARMBaseInstrInfo &TII =
+      *static_cast<const ARMBaseInstrInfo *>(MF.getSubtarget().getInstrInfo());
+  DebugLoc DL = MBB.findDebugLoc(MBBI);
+
+  MachineFunction::iterator MBBInsertPoint = std::next(MBB.getIterator());
+  MachineBasicBlock *LoopMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock());
+  MF.insert(MBBInsertPoint, LoopMBB);
+  MachineBasicBlock *ExitMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock());
+  MF.insert(MBBInsertPoint, ExitMBB);
+
+  if (isARM) {
+    // ADD SP, SP, #NegProbeSize (or equivalent if NegProbeSize is not
+    // encodable in ADD).
+    auto loopMBBend = LoopMBB->end();
+    emitSPUpdate(true, *LoopMBB, loopMBBend, DL, TII, NegProbeSize,
+                 MachineInstr::FrameSetup);
+
+    // STR TargetReg, [SP, #StackClashCallerGuard]
+    BuildMI(*LoopMBB, LoopMBB->end(), DL, TII.get(ARM::STRi12))
+        .addReg(TargetReg)
+        .addReg(ARM::SP)
+        .addImm(ARM::StackClashCallerGuard)
+        .add(predOps(ARMCC::AL));
+
+    // CMP SP, TargetReg
+    BuildMI(*LoopMBB, LoopMBB->end(), DL, TII.get(ARM::CMPrr))
+        .addReg(ARM::SP)
+        .addReg(TargetReg)
+        .add(predOps(ARMCC::AL));
+
+    // B.CC Loop
+    BuildMI(*LoopMBB, LoopMBB->end(), DL, TII.get(ARM::Bcc))
+        .addMBB(LoopMBB)
+        .addImm(ARMCC::NE)
+        .addReg(ARM::CPSR);
+  } else {
+    // ADD SP, SP, #NegProbeSize (or equivalent if NegProbeSize is not
+    // encodable in ADD).
+    auto loopMBBend = LoopMBB->end();
+    emitSPUpdate(isARM, *LoopMBB, loopMBBend, DL, TII, NegProbeSize,
+                 MachineInstr::FrameSetup);
+
+    // STR TargetReg, [SP, #StackClashCallerGuard]
+    BuildMI(*LoopMBB, LoopMBB->end(), DL, TII.get(ARM::t2STRi12))
+        .addReg(TargetReg)
+        .addReg(ARM::SP)
+        .addImm(ARM::StackClashCallerGuard)
+        .add(predOps(ARMCC::AL));
+
+    // CMP SP, TargetReg
+    BuildMI(*LoopMBB, LoopMBB->end(), DL, TII.get(ARM::t2CMPrr))
+        .addReg(ARM::SP)
+        .addReg(TargetReg)
+        .add(predOps(ARMCC::AL));
+
+    // B.CC Loop
+    BuildMI(*LoopMBB, LoopMBB->end(), DL, TII.get(ARM::t2Bcc))
+        .addMBB(LoopMBB)
+        .addImm(ARMCC::NE)
+        .addReg(ARM::CPSR);
+  }
+
+  LoopMBB->addSuccessor(ExitMBB);
+  LoopMBB->addSuccessor(LoopMBB);
+  // Synthesize the exit MBB.
+  ExitMBB->splice(ExitMBB->end(), &MBB, std::next(MBBI), MBB.end());
+  ExitMBB->transferSuccessorsAndUpdatePHIs(&MBB);
+  MBB.addSuccessor(LoopMBB);
+  // Update liveins.
+  recomputeLiveIns(*LoopMBB);
+  recomputeLiveIns(*ExitMBB);
+
+  return ExitMBB->begin();
+}
+
+MachineBasicBlock::iterator ARMFrameLowering::inlineStackProbeFixed(
+    MachineFunction &MF, MachineBasicBlock::iterator MBBI) const {
+  MachineBasicBlock &MBB = *MBBI->getParent();
+  const ARMTargetLowering *TLI =
+      MF.getSubtarget<ARMSubtarget>().getTargetLowering();
+  const ARMBaseInstrInfo &TII =
+      *static_cast<const ARMBaseInstrInfo *>(MF.getSubtarget().getInstrInfo());
+
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  bool isARM = !AFI->isThumbFunction();
+  bool HasFP = hasFP(MF);
+  bool NeedsWinCFI = needsWinCFI(MF);
+  bool EmitCFI = !NeedsWinCFI;
+  bool HasWinCFI = false;
+
+  DebugLoc DL = MBB.findDebugLoc(MBBI);
+  Register ScratchReg = MBBI->getOperand(0).getReg();
+  int64_t NegFrameSize = MBBI->getOperand(1).getImm();
+  int64_t NegProbeSize = -(int64_t)TLI->getStackProbeSize(MF);
+  int64_t NumBlocks = NegFrameSize / NegProbeSize;
+  int64_t NegResidualSize = NegFrameSize % NegProbeSize;
+  bool NeedResidualProbe =
+      NegResidualSize <= -(int64_t)TLI->getStackProbeMaxUnprobedStack(MF);
+  bool UnrollProbeLoop = NumBlocks <= ARM::StackClashCallerMaxUnrollPage;
+  LLVM_DEBUG(dbgs() << "Stack probing (fixed): total " << NegFrameSize
+                    << " bytes, " << NumBlocks << " blocks of " << NegProbeSize
+                    << " bytes, " << NeedResidualProbe << " block of "
+                    << NegResidualSize << " bytes"
+                    << " (residual), Unroll: " << UnrollProbeLoop << ", "
+                    << "CFI: " << (EmitCFI && !HasFP) << "\n");
+
+  MachineBasicBlock::iterator NextInst;
+  if (UnrollProbeLoop) {
+    for (int i = 0; i < NumBlocks; ++i) {
+      if (isARM) {
+        // ADD SP, SP, #NegProbeSize (or equivalent if NegProbeSize is not
+        // encodable in ADD).
+        emitSPUpdate(isARM, MBB, MBBI, DL, TII, NegProbeSize,
+                     MachineInstr::NoFlags);
+        // STR ScratchReg, [SP, #StackClashCallerGuard]
+        BuildMI(MBB, MBBI, DL, TII.get(ARM::STRi12))
+            .addReg(ScratchReg)
+            .addReg(ARM::SP)
+            .addImm(ARM::StackClashCallerGuard)
+            .add(predOps(ARMCC::AL));
+      } else {
+        emitSPUpdate(isARM, MBB, MBBI, DL, TII, NegProbeSize,
+                     MachineInstr::FrameSetup);
+
+        // STR ScratchReg, [SP, #StackClashCallerGuard]
+        BuildMI(MBB, MBBI, DL, TII.get(ARM::t2STRi12))
+            .addReg(ScratchReg)
+            .addReg(ARM::SP)
+            .addImm(ARM::StackClashCallerGuard)
+            .add(predOps(ARMCC::AL));
+      }
+    }
+    NextInst = std::next(MBBI);
+  } else if (NumBlocks != 0) {
+    // ADD ScratchReg, SP, #NegFrameSize (or equivalent if NegFrameSize is not
+    // encodable in ADD).
+    // TODO:
+    emitRegPlusImmediate(isARM, MBB, MBBI, DL, TII, ScratchReg, ARM::SP,
+                         NegFrameSize, MachineInstr::NoFlags, ARMCC::AL, 0);
+
+    NextInst = inlineStackProbeLoopExactMultiple(
+        MF, MBBI, NegProbeSize, ScratchReg, NeedsWinCFI, &HasWinCFI, EmitCFI);
+  }
+
+  if (NegResidualSize != 0) {
+    // ADD SP, SP, #NegResidualSize (or equivalent if NegResidualSize is not
+    // encodable in ADD).
+    if (isARM) {
+      emitSPUpdate(isARM, MBB, MBBI, DL, TII, NegResidualSize,
+                   MachineInstr::FrameSetup);
+      if (NeedResidualProbe) {
+        // STR ScratchReg, [SP, #StackClashCallerGuard]
+        BuildMI(MBB, MBBI, DL, TII.get(ARM::STRi12))
+            .addReg(ScratchReg)
+            .addReg(ARM::SP)
+            .addImm(ARM::StackClashCallerGuard)
+            .add(predOps(ARMCC::AL));
+      }
+    } else {
+      emitSPUpdate(isARM, MBB, MBBI, DL, TII, NegResidualSize,
+                   MachineInstr::FrameSetup);
+      if (NeedResidualProbe) {
+        // STR ScratchReg, [SP, #StackClashCallerGuard]
+        BuildMI(MBB, MBBI, DL, TII.get(ARM::t2STRi12))
+            .addReg(ScratchReg)
+            .addReg(ARM::SP)
+            .addImm(ARM::StackClashCallerGuard)
+            .add(predOps(ARMCC::AL));
+      }
+    }
+  }
+
+  MBBI->eraseFromParent();
+  return NextInst;
+}
+
+MachineBasicBlock::iterator
+ARMFrameLowering::inlineStackProbeVar(MachineFunction &MF,
+                                      MachineBasicBlock::iterator MBBI) const {
+  MachineBasicBlock &MBB = *MBBI->getParent();
+
+  DebugLoc DL = MBB.findDebugLoc(MBBI);
+  Register TargetReg = MBBI->getOperand(0).getReg();
+  MachineBasicBlock::iterator NextInst = std::next(MBBI);
+
+  NextInst = insertStackProbingLoop(MBBI, TargetReg);
+
+  MBBI->eraseFromParent();
+  return NextInst;
+}
+
+MachineBasicBlock::iterator
+ARMFrameLowering::insertStackProbingLoop(MachineBasicBlock::iterator MBBI,
+                                         Register TargetReg) const {
+  MachineBasicBlock &MBB = *MBBI->getParent();
+  MachineFunction &MF = *MBB.getParent();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  bool isARM = !AFI->isThumbFunction();
+  const ARMTargetLowering *TLI =
+      MF.getSubtarget<ARMSubtarget>().getTargetLowering();
+  const ARMBaseInstrInfo &TII =
+      *static_cast<const ARMBaseInstrInfo *>(MF.getSubtarget().getInstrInfo());
+
+  int64_t NegProbeSize = -(int64_t)TLI->getStackProbeSize(MF);
+  DebugLoc DL = MBB.findDebugLoc(MBBI);
+
+  MachineFunction::iterator MBBInsertPoint = std::next(MBB.getIterator());
+  MachineBasicBlock *LoopTestMBB =
+      MF.CreateMachineBasicBlock(MBB.getBasicBlock());
+  MF.insert(MBBInsertPoint, LoopTestMBB);
+  MachineBasicBlock *LoopBodyMBB =
+      MF.CreateMachineBasicBlock(MBB.getBasicBlock());
+  MF.insert(MBBInsertPoint, LoopBodyMBB);
+  MachineBasicBlock *ExitMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock());
+  MF.insert(MBBInsertPoint, ExitMBB);
+
+  // LoopTest:
+  //   SUB SP, SP, #ProbeSize
+  if (isARM) {
+    MachineBasicBlock::iterator LoopTestMBBItr = LoopTestMBB->end();
+    emitSPUpdate(isARM, *LoopTestMBB, LoopTestMBBItr, DL, TII, NegProbeSize,
+                 MachineInstr::NoFlags);
+
+    // CMP SP, TargetReg
+    BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII.get(ARM::CMPrr))
+        .addReg(ARM::SP)
+        .addReg(TargetReg)
+        .add(predOps(ARMCC::AL));
+
+    // B.LE LoopExit
+    BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII.get(ARM::Bcc))
+        .addMBB(ExitMBB)
+        .addImm(ARMCC::LE)
+        .addReg(ARM::CPSR);
+
+    // STR TargetReg, [SP, #StackClashCallerGuard]
+    BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII.get(ARM::STRi12))
+        .addReg(TargetReg)
+        .addReg(ARM::SP)
+        .addImm(ARM::StackClashCallerGuard)
+        .add(predOps(ARMCC::AL));
+
+    // B loop
+    BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII.get(ARM::B))
+        .addMBB(LoopTestMBB);
+
+    // LoopExit:
+    //   MOV SP, TargetReg
+    BuildMI(*ExitMBB, ExitMBB->end(), DL, TII.get(ARM::MOVr), ARM::SP)
+        .addReg(TargetReg)
+        .add(predOps(ARMCC::AL))
+        .add(condCodeOp());
+
+    // STR TargetReg, [SP, #StackClashCallerGuard]
+    BuildMI(*ExitMBB, ExitMBB->end(), DL, TII.get(ARM::STRi12))
+        .addReg(TargetReg)
+        .addReg(ARM::SP)
+        .addImm(ARM::StackClashCallerGuard)
+        .add(predOps(ARMCC::AL));
+  } else {
+    MachineBasicBlock::iterator LoopTestMBBItr = LoopTestMBB->end();
+    emitSPUpdate(isARM, *LoopTestMBB, LoopTestMBBItr, DL, TII, NegProbeSize,
+                 MachineInstr::FrameSetup);
+
+    // CMP SP, TargetReg
+    BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII.get(ARM::t2CMPrr))
+        .addReg(ARM::SP)
+        .addReg(TargetReg)
+        .add(predOps(ARMCC::AL));
+
+    // B.LE LoopExit
+    BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII.get(ARM::t2Bcc))
+        .addMBB(ExitMBB)
+        .addImm(ARMCC::LE)
+        .addReg(ARM::CPSR);
+
+    // STR TargetReg, [SP, #StackClashCallerGuard]
+    BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII.get(ARM::t2STRi12))
+        .addReg(TargetReg)
+        .addReg(ARM::SP)
+        .addImm(ARM::StackClashCallerGuard)
+        .add(predOps(ARMCC::AL));
+
+    // B loop
+    BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII.get(ARM::t2B))
+        .addMBB(LoopTestMBB)
+        .add(predOps(ARMCC::AL));
+
+    // LoopExit:
+    //   MOV SP, TargetReg
+    BuildMI(*ExitMBB, ExitMBB->end(), DL, TII.get(ARM::t2MOVr), ARM::SP)
+        .addReg(TargetReg)
+        .add(predOps(ARMCC::AL))
+        .add(condCodeOp());
+
+    // STR TargetReg, [SP, #StackClashCallerGuard]
+    BuildMI(*ExitMBB, ExitMBB->end(), DL, TII.get(ARM::t2STRi12))
+        .addReg(TargetReg)
+        .addReg(ARM::SP)
+        .addImm(ARM::StackClashCallerGuard)
+        .add(predOps(ARMCC::AL));
+  }
+
+  LoopTestMBB->addSuccessor(ExitMBB);
+  LoopTestMBB->addSuccessor(LoopBodyMBB);
+  LoopBodyMBB->addSuccessor(LoopTestMBB);
+
+  // Synthesize the exit MBB.
+  ExitMBB->splice(ExitMBB->end(), &MBB, std::next(MBBI), MBB.end());
+  ExitMBB->transferSuccessorsAndUpdatePHIs(&MBB);
+  MBB.addSuccessor(LoopTestMBB);
+
+  // Update liveins.
+  if (MF.getRegInfo().reservedRegsFrozen()) {
+    recomputeLiveIns(*LoopTestMBB);
+    recomputeLiveIns(*LoopBodyMBB);
+    recomputeLiveIns(*ExitMBB);
+  }
+
+  return ExitMBB->begin();
+}
+
+void ARMFrameLowering::inlineStackProbe(MachineFunction &MF,
+                                        MachineBasicBlock &MBB) const {
+  for (auto MBBI = MBB.begin(), E = MBB.end(); MBBI != E;) {
+    if (MBBI->getOpcode() == ARM::PROBED_STACKALLOC) {
+      MBBI = inlineStackProbeFixed(MF, MBBI);
+      E = MBBI->getParent()->end();
+    } else if (MBBI->getOpcode() == ARM::PROBED_STACKALLOC_VAR) {
+      MBBI = inlineStackProbeVar(MF, MBBI);
+      E = MBBI->getParent()->end();
+    } else {
+      ++MBBI;
+    }
+  }
+}
+
+static unsigned findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB) {
+  MachineFunction *MF = MBB->getParent();
+
+  const ARMSubtarget &Subtarget = MF->getSubtarget<ARMSubtarget>();
+  const ARMBaseRegisterInfo &TRI = *Subtarget.getRegisterInfo();
+  LivePhysRegs LiveRegs(TRI);
+  LiveRegs.addLiveIns(*MBB);
+
+  // Mark callee saved registers as used so we will not choose them.
+  const MCPhysReg *CSRegs = MF->getRegInfo().getCalleeSavedRegs();
+  for (unsigned i = 0; CSRegs[i]; ++i)
+    LiveRegs.addReg(CSRegs[i]);
+
+  const MachineRegisterInfo &MRI = MF->getRegInfo();
+  const TargetRegisterClass &RC = ARM::GPRRegClass;
+  for (unsigned Reg : RC) {
+    if (LiveRegs.available(MRI, Reg))
+      return Reg;
+  }
+  return ARM::NoRegister;
+}
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -112,6 +112,10 @@
       SUBE,         // Sub using carry
       LSLS,         // Shift left producing carry
 
+      // Dynamic stack allocation with stack clash protection; the allocation
+      // is done in blocks and each block is probed with a zero store.
+      PROBED_ALLOCA,
+
       VMOVRRD, // double to two gprs.
       VMOVDRR, // Two gprs to double.
       VMOVSR,  // move gpr to single, used for f32 literal constructed in a gpr
@@ -754,6 +758,17 @@
         ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB,
         Value *Accumulator = nullptr) const override;
 
+    /// True if stack clash protection is enabled for this function.
+    bool hasInlineStackProbe(const MachineFunction &MF) const override;
+
+    /// Get the interval between stack-clash probes, which is equal to the
+    /// stack guard size, in bytes.
+    unsigned getStackProbeSize(MachineFunction &MF) const;
+
+    /// Get the maximum allowed number of unprobed bytes above SP at an ABI
+    /// boundary.
+    unsigned getStackProbeMaxUnprobedStack(MachineFunction &MF) const;
+
   protected:
     std::pair<const TargetRegisterClass *, uint8_t>
     findRepresentativeClass(const TargetRegisterInfo *TRI,
@@ -851,6 +866,8 @@
                       SDValue &Chain) const;
     SDValue LowerREM(SDNode *N, SelectionDAG &DAG) const;
     SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerWindowsDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerInlineDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const;
@@ -993,7 +1010,12 @@
   FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
                            const TargetLibraryInfo *libInfo);
 
-
+  // The number of bytes a caller may move SP by before a probe is required
+  // by stack clash protection.
+  static constexpr unsigned StackClashCallerGuard = 1024;
+  // The maximum number of probe-size blocks to unroll inline before falling
+  // back to a probing loop during stack clash protection.
+  static constexpr unsigned StackClashCallerMaxUnrollPage = 4;
 } // end namespace ARM
 
 } // end namespace llvm
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -1311,10 +1311,7 @@
   setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
   setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
 
-  if (Subtarget->isTargetWindows())
-    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
-  else
-    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);
+  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
 
   // ARMv6 Thumb1 (except for CPUs that support dmb / dsb) and earlier use
   // the default expansion.
@@ -1729,6 +1726,7 @@
     MAKE_CASE(ARMISD::TC_RETURN)
     MAKE_CASE(ARMISD::THREAD_POINTER)
     MAKE_CASE(ARMISD::DYN_ALLOC)
+    MAKE_CASE(ARMISD::PROBED_ALLOCA)
     MAKE_CASE(ARMISD::MEMBARRIER_MCR)
     MAKE_CASE(ARMISD::PRELOAD)
     MAKE_CASE(ARMISD::LDRD)
@@ -10585,9 +10583,7 @@
   case ISD::SDIVREM:
   case ISD::UDIVREM:     return LowerDivRem(Op, DAG);
   case ISD::DYNAMIC_STACKALLOC:
-    if (Subtarget->isTargetWindows())
-      return LowerDYNAMIC_STACKALLOC(Op, DAG);
-    llvm_unreachable("Don't know how to custom lower this!");
+    return LowerDYNAMIC_STACKALLOC(Op, DAG);
   case ISD::STRICT_FP_ROUND:
   case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG);
   case ISD::STRICT_FP_EXTEND:
@@ -20745,7 +20741,7 @@
 }
 
 SDValue
-ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const {
+ARMTargetLowering::LowerWindowsDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const {
   assert(Subtarget->isTargetWindows() && "unsupported target platform");
 
   SDLoc DL(Op);
@@ -22129,3 +22125,82 @@
 
   return nullptr;
 }
+
+bool ARMTargetLowering::hasInlineStackProbe(const MachineFunction &MF) const {
+  // If the function specifically requests inline stack probes, emit them.
+  if (MF.getFunction().hasFnAttribute("probe-stack")) {
+    if (MF.getFunction().getFnAttribute("probe-stack").getValueAsString() ==
+        "inline-asm")
+      return true;
+    else
+      llvm_unreachable("Unsupported stack probing method");
+  }
+
+  return false;
+}
+
+unsigned ARMTargetLowering::getStackProbeSize(MachineFunction &MF) const {
+  const TargetFrameLowering *TFI = Subtarget->getFrameLowering();
+  unsigned StackAlign = TFI->getStackAlignment();
+  assert(StackAlign >= 1 && isPowerOf2_32(StackAlign) &&
+         "Unexpected stack alignment");
+  // The default stack probe size is 4096 if the function has no
+  // stack-probe-size attribute. This is a safe default because it is the
+  // smallest possible guard page size.
+  unsigned StackProbeSize = 4096;
+  const Function &Fn = MF.getFunction();
+  if (Fn.hasFnAttribute("stack-probe-size"))
+    Fn.getFnAttribute("stack-probe-size")
+        .getValueAsString()
+        .getAsInteger(0, StackProbeSize);
+  // Round down to the stack alignment.
+  StackProbeSize &= ~(StackAlign - 1);
+  return StackProbeSize ? StackProbeSize : StackAlign;
+}
+
+unsigned
+ARMTargetLowering::getStackProbeMaxUnprobedStack(MachineFunction &MF) const {
+  // Since the ABI requires saving FP/LR (or just LR for leaf functions), the
+  // save acts as an implicit stack probe. Probing at StackClashCallerGuard
+  // means that the rest of the guard page (assumed to be stack-probe-size
+  // bytes) can be used without further probing.
+  return getStackProbeSize(MF) - ARM::StackClashCallerGuard;
+}
+
+SDValue
+ARMTargetLowering::LowerInlineDYNAMIC_STACKALLOC(SDValue Op,
+                                                 SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+  // Get the inputs.
+  SDNode *Node = Op.getNode();
+  SDValue Chain = Op.getOperand(0);
+  SDValue Size = Op.getOperand(1);
+  MaybeAlign Align =
+      cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
+  EVT VT = Node->getValueType(0);
+
+  // Construct the new SP value in a GPR.
+  SDValue SP = DAG.getCopyFromReg(Chain, dl, ARM::SP, MVT::i32);
+  Chain = SP.getValue(1);
+  SP = DAG.getNode(ISD::SUB, dl, MVT::i32, SP, Size);
+  if (Align)
+    SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
+                     DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
+
+  // Set the real SP to the new value with a probing loop.
+  Chain = DAG.getNode(ARMISD::PROBED_ALLOCA, dl, MVT::Other, Chain, SP);
+  SDValue Ops[2] = {SP, Chain};
+  return DAG.getMergeValues(Ops, dl);
+}
+
+SDValue ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
+                                                   SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+
+  if (Subtarget->isTargetWindows())
+    return LowerWindowsDYNAMIC_STACKALLOC(Op, DAG);
+  if (hasInlineStackProbe(MF))
+    return LowerInlineDYNAMIC_STACKALLOC(Op, DAG);
+  return SDValue();
+}
diff --git a/llvm/lib/Target/ARM/ARMInstrInfo.td b/llvm/lib/Target/ARM/ARMInstrInfo.td
--- a/llvm/lib/Target/ARM/ARMInstrInfo.td
+++ b/llvm/lib/Target/ARM/ARMInstrInfo.td
@@ -2169,6 +2169,10 @@
                                  i32imm:$size), NoItinerary, []>;
 
+def ARMprobedalloca
+    : SDNode<"ARMISD::PROBED_ALLOCA",
+             SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>,
+             [SDNPHasChain]>;
 // FIXME: Marking these as hasSideEffects is necessary to prevent machine DCE
 // from removing one half of the matched pairs. That breaks PEI, which assumes
 // these will always be in pairs, and asserts if it finds otherwise. Better way?
@@ -2180,6 +2184,33 @@
 def ADJCALLSTACKDOWN :
 PseudoInst<(outs), (ins i32imm:$amt, i32imm:$amt2, pred:$p), NoItinerary,
            [(ARMcallseq_start timm:$amt, timm:$amt2)]>;
+
+// Probed stack allocation of a constant size, used in function prologues when
+// stack-clash protection is enabled.
+def PROBED_STACKALLOC : PseudoInst<(outs GPR:$scratch),
+                                   (ins i32imm:$stacksize),
+                                   NoItinerary,
+                                   []>,
+                        Sched<[]>;
+
+// Probed stack allocation of a variable size, used in function prologues when
+// stack-clash protection is enabled. The register input is the target SP,
+// which should be below the current value, and has no alignment requirements
+// beyond the usual stack alignment for SP.
+def PROBED_STACKALLOC_VAR : PseudoInst<(outs),
+                                       (ins GPR:$target),
+                                       NoItinerary,
+                                       []>,
+                            Sched<[]>;
+
+// Probed stack allocation of a variable size, used for allocas of unknown
+// size when stack-clash protection is enabled.
+def PROBED_STACKALLOC_DYN
+    : PseudoInst<(outs),
+                 (ins GPR:$target),
+                 NoItinerary,
+                 [(ARMprobedalloca GPR:$target)]>,
+      Sched<[]>;
 }
 
 def HINT : AI<(outs), (ins imm0_239:$imm), MiscFrm, NoItinerary,
diff --git a/llvm/lib/Target/ARM/Thumb1FrameLowering.h b/llvm/lib/Target/ARM/Thumb1FrameLowering.h
--- a/llvm/lib/Target/ARM/Thumb1FrameLowering.h
+++ b/llvm/lib/Target/ARM/Thumb1FrameLowering.h
@@ -42,6 +42,16 @@
                                  MachineBasicBlock &MBB,
                                  MachineBasicBlock::iterator MI) const override;
 
+  /// Replace a StackProbe stub (if any) with the actual probe code inline.
+  void inlineStackProbe(MachineFunction &MF,
+                        MachineBasicBlock &PrologueMBB) const override;
+  MachineBasicBlock::iterator
+  inlineStackProbeFixed(MachineFunction &MF,
+                        MachineBasicBlock::iterator MBBI) const;
+  MachineBasicBlock::iterator
+  inlineStackProbeVar(MachineFunction &MF,
+                      MachineBasicBlock::iterator MBBI) const;
+
   /// Check whether or not the given \p MBB can be used as a epilogue
   /// for the target.
   /// The epilogue will be inserted before the first terminator of that block.
@@ -54,6 +64,10 @@
     return false;
   }
 
+  MachineBasicBlock::iterator
+  insertStackProbingLoop(MachineBasicBlock::iterator MBBI,
+                         Register TargetReg) const override;
+
 private:
   /// Check if the frame lowering of \p MF needs a special fixup
   /// code sequence for the epilogue.
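
Note: all three frame lowerings in this patch (ARM/Thumb2 above, Thumb1 below)
share the same probing-loop shape, so here is a conceptual C model of what
insertStackProbingLoop emits. This is only an illustration, not the actual
codegen: probing_loop, sp, and target are made-up stand-ins for the emitted
blocks, SP, and TargetReg, and 4096/1024 stand in for the default
stack-probe-size and ARM::StackClashCallerGuard.

    #include <stdint.h>

    /* Conceptual model of the probing loop (illustration only). */
    static uintptr_t probing_loop(uintptr_t sp, uintptr_t target) {
      for (;;) {
        sp -= 4096;                          /* LoopTest: SUB SP, SP, #ProbeSize */
        if (sp <= target)                    /* CMP SP, TargetReg; B.LE LoopExit */
          break;
        *(volatile char *)(sp + 1024) = 0;   /* LoopBody: touch the new page at
                                                the caller-guard offset */
      }
      sp = target;                           /* LoopExit: MOV SP, TargetReg */
      *(volatile char *)(sp + 1024) = 0;     /* one last probe below the new SP */
      return sp;
    }

Touching each guard-page-sized step before moving past it is what prevents SP
from jumping over an unmapped guard page in one go.
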
diff --git a/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp b/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp
--- a/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp
+++ b/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp
@@ -63,6 +63,28 @@
   return !MFI.hasVarSizedObjects();
 }
 
+static unsigned
+findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB,
+                                 const ThumbRegisterInfo *RegInfo) {
+  MachineFunction *MF = MBB->getParent();
+
+  LivePhysRegs LiveRegs(*RegInfo);
+  LiveRegs.addLiveIns(*MBB);
+
+  // Mark callee saved registers as used so we will not choose them.
+  const MCPhysReg *CSRegs = MF->getRegInfo().getCalleeSavedRegs();
+  for (unsigned i = 0; CSRegs[i]; ++i)
+    LiveRegs.addReg(CSRegs[i]);
+
+  const MachineRegisterInfo &MRI = MF->getRegInfo();
+  const TargetRegisterClass &RC = ARM::GPRRegClass;
+  for (unsigned Reg : RC) {
+    if (LiveRegs.available(MRI, Reg))
+      return Reg;
+  }
+  return ARM::NoRegister;
+}
+
 static void
 emitPrologueEpilogueSPUpdate(MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator &MBBI,
@@ -157,7 +179,7 @@
       *static_cast<const Thumb1InstrInfo *>(STI.getInstrInfo());
 
   unsigned ArgRegsSaveSize = AFI->getArgRegsSaveSize();
-  unsigned NumBytes = MFI.getStackSize();
+  int NumBytes = MFI.getStackSize();
   assert(NumBytes >= ArgRegsSaveSize &&
          "ArgRegsSaveSize is included in NumBytes");
   const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
@@ -414,21 +436,37 @@
   }
 
   if (NumBytes) {
-    // Insert it after all the callee-save spills.
-    //
-    // For a large stack frame, we might need a scratch register to store
-    // the size of the frame.  We know all callee-save registers are free
-    // at this point in the prologue, so pick one.
-    unsigned ScratchRegister = ARM::NoRegister;
-    for (auto &I : CSI) {
-      Register Reg = I.getReg();
-      if (isARMLowRegister(Reg) && !(HasFP && Reg == FramePtr)) {
-        ScratchRegister = Reg;
-        break;
+    const ARMTargetLowering *TLI =
+        MF.getSubtarget<ARMSubtarget>().getTargetLowering();
+    bool NeedsStackProbe =
+        TLI->hasInlineStackProbe(MF) &&
+        (NumBytes >= TLI->getStackProbeMaxUnprobedStack(MF) ||
+         MFI.hasVarSizedObjects());
+    bool NeedsRealignment = RegInfo->hasStackRealignment(MF);
+    if (NeedsStackProbe && !NeedsRealignment) {
+      Register ScratchReg = findScratchNonCalleeSaveRegister(&MBB, RegInfo);
+      assert(ScratchReg != ARM::NoRegister);
+      BuildMI(MBB, MBBI, dl, TII.get(ARM::PROBED_STACKALLOC))
+          .addDef(ScratchReg)
+          .addImm(-NumBytes);
+    } else {
+      // Insert it after all the callee-save spills.
+      //
+      // For a large stack frame, we might need a scratch register to store
+      // the size of the frame.  We know all callee-save registers are free
+      // at this point in the prologue, so pick one.
+      unsigned ScratchRegister = ARM::NoRegister;
+      for (auto &I : CSI) {
+        Register Reg = I.getReg();
+        if (isARMLowRegister(Reg) && !(HasFP && Reg == FramePtr)) {
+          ScratchRegister = Reg;
+          break;
+        }
       }
+      emitPrologueEpilogueSPUpdate(MBB, MBBI, TII, dl, *RegInfo, -NumBytes,
+                                   ScratchRegister, MachineInstr::FrameSetup);
     }
-    emitPrologueEpilogueSPUpdate(MBB, MBBI, TII, dl, *RegInfo, -NumBytes,
-                                 ScratchRegister, MachineInstr::FrameSetup);
+
     if (!HasFP) {
       CFAOffset += NumBytes;
       unsigned CFIIndex = MF.addFrameInst(
@@ -449,6 +487,11 @@
 
   if (RegInfo->hasStackRealignment(MF)) {
     const unsigned NrBitsToZero = Log2(MFI.getMaxAlign());
+    const ARMTargetLowering *TLI =
+        MF.getSubtarget<ARMSubtarget>().getTargetLowering();
+    bool NeedsStackProbe = TLI->hasInlineStackProbe(MF) &&
+                           (NumBytes + MFI.getMaxAlign().value()) >=
+                               TLI->getStackProbeMaxUnprobedStack(MF);
     // Emit the following sequence, using R4 as a temporary, since we cannot use
     // SP as a source or destination register for the shifts:
    // mov  r4, sp
@@ -471,9 +514,14 @@
         .addImm(NrBitsToZero)
         .add(predOps(ARMCC::AL));
 
-    BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), ARM::SP)
-        .addReg(ARM::R4, RegState::Kill)
-        .add(predOps(ARMCC::AL));
+    if (NeedsStackProbe) {
+      BuildMI(MBB, MBBI, dl, TII.get(ARM::PROBED_STACKALLOC_VAR))
+          .addUse(ARM::R4);
+    } else {
+      BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), ARM::SP)
+          .addReg(ARM::R4, RegState::Kill)
+          .add(predOps(ARMCC::AL));
+    }
 
     AFI->setShouldRestoreSPFromFP(true);
   }
@@ -1202,3 +1250,254 @@
 
   return true;
 }
+
+/// Emit a loop to decrement SP until it is equal to TargetReg, with probes at
+/// least every NegProbeSize bytes. Returns an iterator of the first
+/// instruction after the loop. The difference between SP and TargetReg must be
+/// an exact multiple of NegProbeSize.
+static MachineBasicBlock::iterator inlineStackProbeLoopExactMultiple(
+    MachineFunction &MF, MachineBasicBlock::iterator MBBI,
+    int64_t NegProbeSize, Register TargetReg, const TargetInstrInfo &TII,
+    const ThumbRegisterInfo *RegInfo) {
+  MachineBasicBlock &MBB = *MBBI->getParent();
+  DebugLoc DL = MBB.findDebugLoc(MBBI);
+
+  MachineFunction::iterator MBBInsertPoint = std::next(MBB.getIterator());
+  MachineBasicBlock *LoopMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock());
+  MF.insert(MBBInsertPoint, LoopMBB);
+  MachineBasicBlock *ExitMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock());
+  MF.insert(MBBInsertPoint, ExitMBB);
+
+  // ADD SP, SP, #NegProbeSize (or equivalent if NegProbeSize is not encodable
+  // in ADD).
+  auto loopMBBend = LoopMBB->end();
+  emitCallSPUpdate(*LoopMBB, loopMBBend, TII, DL, *RegInfo, NegProbeSize);
+
+  // STR TargetReg, [SP, #StackClashCallerGuard]
+  BuildMI(*LoopMBB, LoopMBB->end(), DL, TII.get(ARM::tSTRi))
+      .addReg(TargetReg)
+      .addReg(ARM::SP)
+      .addImm(ARM::StackClashCallerGuard / 4)
+      .add(predOps(ARMCC::AL));
+
+  // CMP SP, TargetReg
+  BuildMI(*LoopMBB, LoopMBB->end(), DL, TII.get(ARM::tCMPr))
+      .addReg(ARM::SP)
+      .addReg(TargetReg)
+      .add(predOps(ARMCC::AL));
+
+  // B.CC Loop
+  BuildMI(*LoopMBB, LoopMBB->end(), DL, TII.get(ARM::tBcc))
+      .addMBB(LoopMBB)
+      .addImm(ARMCC::NE)
+      .addReg(ARM::CPSR);
+
+  LoopMBB->addSuccessor(ExitMBB);
+  LoopMBB->addSuccessor(LoopMBB);
+  // Synthesize the exit MBB.
+  ExitMBB->splice(ExitMBB->end(), &MBB, std::next(MBBI), MBB.end());
+  ExitMBB->transferSuccessorsAndUpdatePHIs(&MBB);
+  MBB.addSuccessor(LoopMBB);
+  // Update liveins.
+  recomputeLiveIns(*LoopMBB);
+  recomputeLiveIns(*ExitMBB);
+
+  return ExitMBB->begin();
+}
+
+MachineBasicBlock::iterator Thumb1FrameLowering::inlineStackProbeFixed(
+    MachineFunction &MF, MachineBasicBlock::iterator MBBI) const {
+  MachineBasicBlock &MBB = *MBBI->getParent();
+  const ARMTargetLowering *TLI =
+      MF.getSubtarget<ARMSubtarget>().getTargetLowering();
+  const Thumb1InstrInfo &TII =
+      *static_cast<const Thumb1InstrInfo *>(STI.getInstrInfo());
+  const ThumbRegisterInfo *RegInfo =
+      static_cast<const ThumbRegisterInfo *>(STI.getRegisterInfo());
+
+  DebugLoc DL = MBB.findDebugLoc(MBBI);
+  Register ScratchReg = MBBI->getOperand(0).getReg();
+  int64_t NegFrameSize = MBBI->getOperand(1).getImm();
+  int64_t NegProbeSize = -(int64_t)TLI->getStackProbeSize(MF);
+  int64_t NumBlocks = NegFrameSize / NegProbeSize;
+  int64_t NegResidualSize = NegFrameSize % NegProbeSize;
+  bool NeedResidualProbe =
+      NegResidualSize <= -(int64_t)TLI->getStackProbeMaxUnprobedStack(MF);
+  bool UnrollProbeLoop = NumBlocks <= ARM::StackClashCallerMaxUnrollPage;
+
+  MachineBasicBlock::iterator NextInst;
+  if (UnrollProbeLoop) {
+    for (int i = 0; i < NumBlocks; ++i) {
+      emitCallSPUpdate(MBB, MBBI, TII, DL, *RegInfo, NegProbeSize);
+      // STR ScratchReg, [SP, #StackClashCallerGuard]
+      BuildMI(MBB, MBBI, DL, TII.get(ARM::tSTRi))
+          .addReg(ScratchReg)
+          .addReg(ARM::SP)
+          .addImm(ARM::StackClashCallerGuard / 4)
+          .add(predOps(ARMCC::AL));
+    }
+    NextInst = std::next(MBBI);
+  } else if (NumBlocks != 0) {
+    // ADD ScratchReg, SP, #NegFrameSize (or equivalent if NegFrameSize is not
+    // encodable in ADD), emitted as:
+    //   ADD ScratchReg, SP, 0
+    //   ADD ScratchReg, NegFrameSize
+    emitThumbRegPlusImmediate(MBB, MBBI, DL, ScratchReg, ARM::SP, 0, TII,
+                              *RegInfo, MachineInstr::NoFlags);
+    emitThumbRegPlusImmediate(MBB, MBBI, DL, ScratchReg, ScratchReg,
+                              NegFrameSize, TII, *RegInfo,
+                              MachineInstr::NoFlags);
+
+    NextInst = inlineStackProbeLoopExactMultiple(MF, MBBI, NegProbeSize,
+                                                 ScratchReg, TII, RegInfo);
+  }
+
+  if (NegResidualSize != 0) {
+    // ADD SP, SP, #NegResidualSize (or equivalent if NegResidualSize is not
+    // encodable in ADD).
+    emitCallSPUpdate(MBB, MBBI, TII, DL, *RegInfo, NegResidualSize);
+    if (NeedResidualProbe) {
+      // STR ScratchReg, [SP, #StackClashCallerGuard]
+      BuildMI(MBB, MBBI, DL, TII.get(ARM::tSTRi))
+          .addReg(ScratchReg)
+          .addReg(ARM::SP)
+          .addImm(ARM::StackClashCallerGuard / 4)
+          .add(predOps(ARMCC::AL));
+    }
+  }
+
+  MBBI->eraseFromParent();
+  return NextInst;
+}
+
+MachineBasicBlock::iterator
+Thumb1FrameLowering::insertStackProbingLoop(MachineBasicBlock::iterator MBBI,
+                                            Register TargetReg) const {
+  MachineBasicBlock &MBB = *MBBI->getParent();
+  MachineFunction &MF = *MBB.getParent();
+  const ARMTargetLowering *TLI =
+      MF.getSubtarget<ARMSubtarget>().getTargetLowering();
+  const Thumb1InstrInfo &TII =
+      *static_cast<const Thumb1InstrInfo *>(STI.getInstrInfo());
+  const ThumbRegisterInfo *RegInfo =
+      static_cast<const ThumbRegisterInfo *>(STI.getRegisterInfo());
+
+  int64_t NegProbeSize = -(int64_t)TLI->getStackProbeSize(MF);
+  DebugLoc DL = MBB.findDebugLoc(MBBI);
+
+  MachineFunction::iterator MBBInsertPoint = std::next(MBB.getIterator());
+  MachineBasicBlock *LoopTestMBB =
+      MF.CreateMachineBasicBlock(MBB.getBasicBlock());
+  MF.insert(MBBInsertPoint, LoopTestMBB);
+  MachineBasicBlock *LoopBodyMBB =
+      MF.CreateMachineBasicBlock(MBB.getBasicBlock());
+  MF.insert(MBBInsertPoint, LoopBodyMBB);
+  MachineBasicBlock *ExitMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock());
+  MF.insert(MBBInsertPoint, ExitMBB);
+
+  auto loopTestMBBItr = LoopTestMBB->end();
+  unsigned ScratchRegister = ARM::NoRegister;
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  bool HasFP = hasFP(MF);
+  Register FramePtr = RegInfo->getFrameRegister(MF);
+  const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
+  for (auto &I : CSI) {
+    Register Reg = I.getReg();
+    if (isARMLowRegister(Reg) && !(HasFP && Reg == FramePtr)) {
+      ScratchRegister = Reg;
+      break;
+    }
+  }
+
+  // LoopTest:
+  //   SUB SP, SP, #ProbeSize
+  emitPrologueEpilogueSPUpdate(*LoopTestMBB, loopTestMBBItr, TII, DL, *RegInfo,
+                               NegProbeSize, ScratchRegister,
+                               MachineInstr::NoFlags);
+
+  // CMP SP, TargetReg
+  BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII.get(ARM::tCMPr))
+      .addReg(ARM::SP)
+      .addReg(TargetReg)
+      .add(predOps(ARMCC::AL));
+
+  // B.LE LoopExit
+  BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII.get(ARM::tBcc))
+      .addMBB(ExitMBB)
+      .addImm(ARMCC::LE)
+      .addReg(ARM::CPSR);
+
+  // STR TargetReg, [SP, #StackClashCallerGuard]
+  BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII.get(ARM::tSTRi))
+      .addReg(TargetReg)
+      .addReg(ARM::SP)
+      .addImm(ARM::StackClashCallerGuard / 4)
+      .add(predOps(ARMCC::AL));
+
+  // B loop
+  BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII.get(ARM::tB))
+      .addMBB(LoopTestMBB)
+      .add(predOps(ARMCC::AL));
+
+  // LoopExit:
+  //   MOV SP, TargetReg
+  BuildMI(*ExitMBB, ExitMBB->end(), DL, TII.get(ARM::tMOVr), ARM::SP)
+      .addReg(TargetReg)
+      .add(predOps(ARMCC::AL));
+
+  // STR TargetReg, [SP, #StackClashCallerGuard]
+  BuildMI(*ExitMBB, ExitMBB->end(), DL, TII.get(ARM::tSTRi))
+      .addReg(TargetReg)
+      .addReg(ARM::SP)
+      .addImm(ARM::StackClashCallerGuard / 4)
+      .add(predOps(ARMCC::AL));
+
+  LoopTestMBB->addSuccessor(ExitMBB);
+  LoopTestMBB->addSuccessor(LoopBodyMBB);
+  LoopBodyMBB->addSuccessor(LoopTestMBB);
+
+  // Synthesize the exit MBB.
+  ExitMBB->splice(ExitMBB->end(), &MBB, std::next(MBBI), MBB.end());
+  ExitMBB->transferSuccessorsAndUpdatePHIs(&MBB);
+  MBB.addSuccessor(LoopTestMBB);
+
+  // Update liveins.
+ if (MF.getRegInfo().reservedRegsFrozen()) { + recomputeLiveIns(*LoopTestMBB); + recomputeLiveIns(*LoopBodyMBB); + recomputeLiveIns(*ExitMBB); + } + + return ExitMBB->begin(); +} + +MachineBasicBlock::iterator Thumb1FrameLowering::inlineStackProbeVar( + MachineFunction &MF, MachineBasicBlock::iterator MBBI) const { + MachineBasicBlock &MBB = *MBBI->getParent(); + + DebugLoc DL = MBB.findDebugLoc(MBBI); + Register TargetReg = MBBI->getOperand(0).getReg(); + MachineBasicBlock::iterator NextInst = std::next(MBBI); + + NextInst = insertStackProbingLoop(MBBI, TargetReg); + + MBBI->eraseFromParent(); + return NextInst; +} + +void Thumb1FrameLowering::inlineStackProbe(MachineFunction &MF, + MachineBasicBlock &MBB) const { + for (auto MBBI = MBB.begin(), E = MBB.end(); MBBI != E;) { + if (MBBI->getOpcode() == ARM::PROBED_STACKALLOC) { + MBBI = inlineStackProbeFixed(MF, MBBI); + E = MBBI->getParent()->end(); + } else if (MBBI->getOpcode() == ARM::PROBED_STACKALLOC_VAR) { + MBBI = inlineStackProbeVar(MF, MBBI); + E = MBBI->getParent()->end(); + } else { + ++MBBI; + } + } +} diff --git a/llvm/test/CodeGen/ARM/stackProbing_arm.ll b/llvm/test/CodeGen/ARM/stackProbing_arm.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/ARM/stackProbing_arm.ll @@ -0,0 +1,191 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple arm-eabi-linux < %s -verify-machineinstrs | FileCheck %s + +; Function Attrs: noinline nounwind optnone +define dso_local void @large_stack() "probe-stack"="inline-asm" "frame-pointer"="none"{ +; CHECK-LABEL: large_stack: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5} +; CHECK-NEXT: push {r4, r5} +; CHECK-NEXT: sub r0, sp, #132 +; CHECK-NEXT: sub r0, r0, #79872 +; CHECK-NEXT: .pad #132 +; CHECK-NEXT: sub sp, sp, #132 +; CHECK-NEXT: .pad #2048 +; CHECK-NEXT: sub sp, sp, #2048 +; CHECK-NEXT: .LBB0_1: @ %entry +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: .pad #4096 +; CHECK-NEXT: sub sp, sp, #4096 +; CHECK-NEXT: cmp sp, r0 +; CHECK-NEXT: str r0, [sp, #1024] +; CHECK-NEXT: bne .LBB0_1 +; CHECK-NEXT: @ %bb.2: @ %entry +; CHECK-NEXT: mov r0, #0 +; CHECK-NEXT: add r1, sp, #4 +; CHECK-NEXT: str r0, [sp] +; CHECK-NEXT: mov r0, #31 +; CHECK-NEXT: orr r0, r0, #19968 +; CHECK-NEXT: .LBB0_3: @ %for.cond +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ldr r2, [sp] +; CHECK-NEXT: cmp r2, r0 +; CHECK-NEXT: bhi .LBB0_5 +; CHECK-NEXT: @ %bb.4: @ %for.body +; CHECK-NEXT: @ in Loop: Header=BB0_3 Depth=1 +; CHECK-NEXT: ldr r2, [sp] +; CHECK-NEXT: ldr r3, [sp] +; CHECK-NEXT: str r2, [r1, r3, lsl #2] +; CHECK-NEXT: ldr r2, [sp] +; CHECK-NEXT: add r2, r2, #1 +; CHECK-NEXT: str r2, [sp] +; CHECK-NEXT: b .LBB0_3 +; CHECK-NEXT: .LBB0_5: @ %for.end +; CHECK-NEXT: add sp, sp, #132 +; CHECK-NEXT: add sp, sp, #79872 +; CHECK-NEXT: pop {r4, r5} +; CHECK-NEXT: mov pc, lr +entry: + %stack = alloca [20000 x i32], align 4 + %i = alloca i32, align 4 + store volatile i32 0, ptr %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %0 = load volatile i32, ptr %i, align 4 + %cmp = icmp ult i32 %0, 20000 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %1 = load volatile i32, ptr %i, align 4 + %2 = load volatile i32, ptr %i, align 4 + %arrayidx = getelementptr inbounds [20000 x i32], ptr %stack, i32 0, i32 %2 + store volatile i32 %1, ptr %arrayidx, align 4 + br label %for.inc + +for.inc: ; preds = %for.body + %3 = load volatile i32, ptr %i, align 4 + %inc = add nsw i32 %3, 
1 + store volatile i32 %inc, ptr %i, align 4 + br label %for.cond, !llvm.loop !6 + +for.end: ; preds = %for.cond + ret void +} + +; Function Attrs: noinline nounwind optnone +define dso_local void @vla(i32 noundef %n) "probe-stack"="inline-asm" "frame-pointer"="none"{ +; CHECK-LABEL: vla: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r11} +; CHECK-NEXT: push {r4, r5, r11} +; CHECK-NEXT: .setfp r11, sp, #8 +; CHECK-NEXT: add r11, sp, #8 +; CHECK-NEXT: .pad #12 +; CHECK-NEXT: sub sp, sp, #12 +; CHECK-NEXT: mov r1, #7 +; CHECK-NEXT: mov r2, sp +; CHECK-NEXT: add r1, r1, r0, lsl #2 +; CHECK-NEXT: str r0, [r11, #-12] +; CHECK-NEXT: bic r1, r1, #7 +; CHECK-NEXT: str sp, [r11, #-16] +; CHECK-NEXT: sub r3, r2, r1 +; CHECK-NEXT: .LBB1_1: @ %entry +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub sp, sp, #4096 +; CHECK-NEXT: cmp sp, r3 +; CHECK-NEXT: ble .LBB1_3 +; CHECK-NEXT: @ %bb.2: @ %entry +; CHECK-NEXT: @ in Loop: Header=BB1_1 Depth=1 +; CHECK-NEXT: str r3, [sp, #1024] +; CHECK-NEXT: b .LBB1_1 +; CHECK-NEXT: .LBB1_3: @ %entry +; CHECK-NEXT: mov sp, r3 +; CHECK-NEXT: str r3, [sp, #1024] +; CHECK-NEXT: mov r3, #0 +; CHECK-NEXT: strb r3, [r2, -r1] +; CHECK-NEXT: str r0, [r11, #-20] +; CHECK-NEXT: ldr sp, [r11, #-16] +; CHECK-NEXT: sub sp, r11, #8 +; CHECK-NEXT: pop {r4, r5, r11} +; CHECK-NEXT: mov pc, lr +entry: + %n.addr = alloca i32, align 4 + %saved_stack = alloca ptr, align 4 + %__vla_expr0 = alloca i32, align 4 + store i32 %n, ptr %n.addr, align 4 + %0 = load i32, ptr %n.addr, align 4 + %1 = call ptr @llvm.stacksave() + store ptr %1, ptr %saved_stack, align 4 + %vla = alloca i32, i32 %0, align 4 + store i32 %0, ptr %__vla_expr0, align 4 + %arrayidx = getelementptr inbounds i32, ptr %vla, i32 0 + call void @llvm.memset.p0.i32(ptr align 4 %arrayidx, i8 0, i32 1, i1 false) + %2 = load ptr, ptr %saved_stack, align 4 + call void @llvm.stackrestore(ptr %2) + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind willreturn +declare ptr @llvm.stacksave() #1 + +; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: write) +declare void @llvm.memset.p0.i32(ptr nocapture writeonly, i8, i32, i1 immarg) #2 + +; Function Attrs: nocallback nofree nosync nounwind willreturn +declare void @llvm.stackrestore(ptr) #1 + +; Function Attrs: noinline nounwind optnone +define dso_local void @builtin_alloca(i32 noundef %n) "probe-stack"="inline-asm" "frame-pointer"="none"{ +; CHECK-LABEL: builtin_alloca: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r11} +; CHECK-NEXT: push {r4, r5, r11} +; CHECK-NEXT: .setfp r11, sp, #8 +; CHECK-NEXT: add r11, sp, #8 +; CHECK-NEXT: .pad #12 +; CHECK-NEXT: sub sp, sp, #12 +; CHECK-NEXT: str r0, [r11, #-12] +; CHECK-NEXT: add r0, r0, #7 +; CHECK-NEXT: bic r0, r0, #7 +; CHECK-NEXT: sub r0, sp, r0 +; CHECK-NEXT: .LBB2_1: @ %entry +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub sp, sp, #4096 +; CHECK-NEXT: cmp sp, r0 +; CHECK-NEXT: ble .LBB2_3 +; CHECK-NEXT: @ %bb.2: @ %entry +; CHECK-NEXT: @ in Loop: Header=BB2_1 Depth=1 +; CHECK-NEXT: str r0, [sp, #1024] +; CHECK-NEXT: b .LBB2_1 +; CHECK-NEXT: .LBB2_3: @ %entry +; CHECK-NEXT: mov sp, r0 +; CHECK-NEXT: str r0, [sp, #1024] +; CHECK-NEXT: str r0, [r11, #-16] +; CHECK-NEXT: sub sp, r11, #8 +; CHECK-NEXT: pop {r4, r5, r11} +; CHECK-NEXT: mov pc, lr +entry: + %n.addr = alloca i32, align 4 + %mem = alloca ptr, align 4 + store i32 %n, ptr %n.addr, align 4 + %0 = load i32, ptr %n.addr, align 4 + %1 = alloca i8, i32 %0, align 8 + store ptr %1, ptr 
%mem, align 4 + ret void +} + +attributes #1 = { nocallback nofree nosync nounwind willreturn } +attributes #2 = { nocallback nofree nounwind willreturn memory(argmem: write) } + +!llvm.module.flags = !{!0, !1, !2, !3, !4} +!llvm.ident = !{!5} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, !"min_enum_size", i32 4} +!2 = !{i32 8, !"PIC Level", i32 2} +!3 = !{i32 7, !"PIE Level", i32 2} +!4 = !{i32 7, !"frame-pointer", i32 2} +!5 = !{!"clang version 17.0.0 (https://github.com/llvm/llvm-project a1677bda7975a0f690292587a04b9e053aacd1dc)"} +!6 = distinct !{!6, !7} +!7 = !{!"llvm.loop.mustprogress"} diff --git a/llvm/test/CodeGen/ARM/stackProbing_thumb.ll b/llvm/test/CodeGen/ARM/stackProbing_thumb.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/ARM/stackProbing_thumb.ll @@ -0,0 +1,216 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=thumb-eabi -mcpu=cortex-m0 -o - | FileCheck %s + +; Function Attrs: noinline nounwind optnone +define dso_local void @large_stack() "probe-stack"="inline-asm" "frame-pointer"="none" { +; CHECK-LABEL: large_stack: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: mov r0, sp +; CHECK-NEXT: ldr r0, .LCPI0_1 +; CHECK-NEXT: subs r0, r0, r0 +; CHECK-NEXT: ldr r1, .LCPI0_2 +; CHECK-NEXT: add sp, r1 +; CHECK-NEXT: .LBB0_1: @ %entry +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ldr r1, .LCPI0_2 +; CHECK-NEXT: add sp, r1 +; CHECK-NEXT: str r0, [sp, #1024] +; CHECK-NEXT: cmp sp, r0 +; CHECK-NEXT: bne .LBB0_1 +; CHECK-NEXT: @ %bb.2: @ %entry +; CHECK-NEXT: movs r0, #0 +; CHECK-NEXT: str r0, [sp] +; CHECK-NEXT: ldr r0, .LCPI0_0 +; CHECK-NEXT: .LBB0_3: @ %for.cond +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ldr r1, [sp] +; CHECK-NEXT: cmp r1, r0 +; CHECK-NEXT: bhi .LBB0_5 +; CHECK-NEXT: @ %bb.4: @ %for.body +; CHECK-NEXT: @ in Loop: Header=BB0_3 Depth=1 +; CHECK-NEXT: ldr r1, [sp] +; CHECK-NEXT: ldr r2, [sp] +; CHECK-NEXT: lsls r2, r2, #2 +; CHECK-NEXT: add r3, sp, #4 +; CHECK-NEXT: str r1, [r3, r2] +; CHECK-NEXT: ldr r1, [sp] +; CHECK-NEXT: adds r1, r1, #1 +; CHECK-NEXT: str r1, [sp] +; CHECK-NEXT: b .LBB0_3 +; CHECK-NEXT: .LBB0_5: @ %for.end +; CHECK-NEXT: ldr r6, .LCPI0_1 +; CHECK-NEXT: add sp, r6 +; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: @ %bb.6: +; CHECK-NEXT: .LCPI0_0: +; CHECK-NEXT: .long 19999 @ 0x4e1f +; CHECK-NEXT: .LCPI0_1: +; CHECK-NEXT: .long 80004 @ 0x13884 +; CHECK-NEXT: .LCPI0_2: +; CHECK-NEXT: .long 4294963200 @ 0xfffff000 +entry: + %stack = alloca [20000 x i32], align 4 + %i = alloca i32, align 4 + store volatile i32 0, ptr %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %0 = load volatile i32, ptr %i, align 4 + %cmp = icmp ult i32 %0, 20000 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %1 = load volatile i32, ptr %i, align 4 + %2 = load volatile i32, ptr %i, align 4 + %arrayidx = getelementptr inbounds [20000 x i32], ptr %stack, i32 0, i32 %2 + store volatile i32 %1, ptr %arrayidx, align 4 + br label %for.inc + +for.inc: ; preds = %for.body + %3 = load volatile i32, ptr %i, align 4 + %inc = add nsw i32 %3, 1 + store volatile i32 %inc, ptr %i, align 4 + br label %for.cond, !llvm.loop !4 + +for.end: ; preds = %for.cond + ret void +} + +; Function Attrs: noinline nounwind optnone +define dso_local void @vla(i32 noundef %n) "probe-stack"="inline-asm" "frame-pointer"="none"{ +; 
CHECK-LABEL: vla: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r6, r7, lr} +; CHECK-NEXT: push {r4, r6, r7, lr} +; CHECK-NEXT: .setfp r7, sp, #8 +; CHECK-NEXT: add r7, sp, #8 +; CHECK-NEXT: ldr r1, .LCPI1_0 +; CHECK-NEXT: add sp, r1 +; CHECK-NEXT: mov r6, sp +; CHECK-NEXT: adds r1, r6, #4 +; CHECK-NEXT: str r0, [r1, #8] +; CHECK-NEXT: mov r2, sp +; CHECK-NEXT: str r2, [r1, #4] +; CHECK-NEXT: lsls r2, r0, #2 +; CHECK-NEXT: adds r2, r2, #7 +; CHECK-NEXT: movs r3, #7 +; CHECK-NEXT: bics r2, r3 +; CHECK-NEXT: mov r3, sp +; CHECK-NEXT: subs r2, r3, r2 +; CHECK-NEXT: .LBB1_1: @ %entry +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ldr r6, .LCPI1_0 +; CHECK-NEXT: add sp, r6 +; CHECK-NEXT: cmp sp, r2 +; CHECK-NEXT: ble .LBB1_3 +; CHECK-NEXT: @ %bb.2: @ %entry +; CHECK-NEXT: @ in Loop: Header=BB1_1 Depth=1 +; CHECK-NEXT: str r2, [sp, #1024] +; CHECK-NEXT: b .LBB1_1 +; CHECK-NEXT: .LBB1_3: @ %entry +; CHECK-NEXT: mov sp, r2 +; CHECK-NEXT: str r2, [sp, #1024] +; CHECK-NEXT: movs r3, #0 +; CHECK-NEXT: strb r3, [r2] +; CHECK-NEXT: str r0, [r1] +; CHECK-NEXT: ldr r0, [r1, #4] +; CHECK-NEXT: mov sp, r0 +; CHECK-NEXT: subs r4, r7, #7 +; CHECK-NEXT: subs r4, #1 +; CHECK-NEXT: mov sp, r4 +; CHECK-NEXT: pop {r4, r6, r7, pc} +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: @ %bb.4: +; CHECK-NEXT: .LCPI1_0: +; CHECK-NEXT: .long 4294963200 @ 0xfffff000 +entry: + %n.addr = alloca i32, align 4 + %saved_stack = alloca ptr, align 4 + %__vla_expr0 = alloca i32, align 4 + store i32 %n, ptr %n.addr, align 4 + %0 = load i32, ptr %n.addr, align 4 + %1 = call ptr @llvm.stacksave() + store ptr %1, ptr %saved_stack, align 4 + %vla = alloca i32, i32 %0, align 4 + store i32 %0, ptr %__vla_expr0, align 4 + %arrayidx = getelementptr inbounds i32, ptr %vla, i32 0 + call void @llvm.memset.p0.i32(ptr align 4 %arrayidx, i8 0, i32 1, i1 false) + %2 = load ptr, ptr %saved_stack, align 4 + call void @llvm.stackrestore(ptr %2) + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind willreturn +declare ptr @llvm.stacksave() #1 + +; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: write) +declare void @llvm.memset.p0.i32(ptr nocapture writeonly, i8, i32, i1 immarg) #2 + +; Function Attrs: nocallback nofree nosync nounwind willreturn +declare void @llvm.stackrestore(ptr) #1 + +; Function Attrs: noinline nounwind optnone +define dso_local void @builtin_alloca(i32 noundef %n) "probe-stack"="inline-asm" "frame-pointer"="none"{ +; CHECK-LABEL: builtin_alloca: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r6, r7, lr} +; CHECK-NEXT: push {r4, r6, r7, lr} +; CHECK-NEXT: .setfp r7, sp, #8 +; CHECK-NEXT: add r7, sp, #8 +; CHECK-NEXT: ldr r1, .LCPI2_0 +; CHECK-NEXT: add sp, r1 +; CHECK-NEXT: mov r6, sp +; CHECK-NEXT: mov r1, r6 +; CHECK-NEXT: str r0, [r1, #4] +; CHECK-NEXT: adds r0, r0, #7 +; CHECK-NEXT: movs r2, #7 +; CHECK-NEXT: bics r0, r2 +; CHECK-NEXT: mov r2, sp +; CHECK-NEXT: subs r0, r2, r0 +; CHECK-NEXT: .LBB2_1: @ %entry +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ldr r6, .LCPI2_0 +; CHECK-NEXT: add sp, r6 +; CHECK-NEXT: cmp sp, r0 +; CHECK-NEXT: ble .LBB2_3 +; CHECK-NEXT: @ %bb.2: @ %entry +; CHECK-NEXT: @ in Loop: Header=BB2_1 Depth=1 +; CHECK-NEXT: str r0, [sp, #1024] +; CHECK-NEXT: b .LBB2_1 +; CHECK-NEXT: .LBB2_3: @ %entry +; CHECK-NEXT: mov sp, r0 +; CHECK-NEXT: str r0, [sp, #1024] +; CHECK-NEXT: str r0, [r1] +; CHECK-NEXT: subs r4, r7, #7 +; CHECK-NEXT: subs r4, #1 +; CHECK-NEXT: mov sp, r4 +; CHECK-NEXT: pop {r4, r6, r7, pc} +; CHECK-NEXT: .p2align 
2 +; CHECK-NEXT: @ %bb.4: +; CHECK-NEXT: .LCPI2_0: +; CHECK-NEXT: .long 4294963200 @ 0xfffff000 +entry: + %n.addr = alloca i32, align 4 + %mem = alloca ptr, align 4 + store i32 %n, ptr %n.addr, align 4 + %0 = load i32, ptr %n.addr, align 4 + %1 = alloca i8, i32 %0, align 8 + store ptr %1, ptr %mem, align 4 + ret void +} + +attributes #1 = { nocallback nofree nosync nounwind willreturn } +attributes #2 = { nocallback nofree nounwind willreturn memory(argmem: write) } + +!llvm.module.flags = !{!0, !1, !2} +!llvm.ident = !{!3} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, !"min_enum_size", i32 4} +!2 = !{i32 7, !"frame-pointer", i32 2} +!3 = !{!"clang version 17.0.0 (https://github.com/llvm/llvm-project a1677bda7975a0f690292587a04b9e053aacd1dc)"} +!4 = distinct !{!4, !5} +!5 = !{!"llvm.loop.mustprogress"} diff --git a/llvm/test/CodeGen/Thumb2/stackProbing_thumb2.ll b/llvm/test/CodeGen/Thumb2/stackProbing_thumb2.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/stackProbing_thumb2.ll @@ -0,0 +1,188 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -mtriple thumbv8.1m.main-none-linux-eabi < %s -verify-machineinstrs | FileCheck %s +; Function Attrs: noinline nounwind optnone +define dso_local void @large_stack() "probe-stack"="inline-asm" "frame-pointer"="none" { +; CHECK-LABEL: large_stack: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5} +; CHECK-NEXT: push {r4, r5} +; CHECK-NEXT: sub.w r0, sp, #79872 +; CHECK-NEXT: subs r0, #132 +; CHECK-NEXT: .pad #2180 +; CHECK-NEXT: subw sp, sp, #2180 +; CHECK-NEXT: .LBB0_1: @ %entry +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: .pad #4096 +; CHECK-NEXT: sub.w sp, sp, #4096 +; CHECK-NEXT: cmp sp, r0 +; CHECK-NEXT: str.w r0, [sp, #1024] +; CHECK-NEXT: bne .LBB0_1 +; CHECK-NEXT: @ %bb.2: @ %entry +; CHECK-NEXT: movs r0, #0 +; CHECK-NEXT: add r1, sp, #4 +; CHECK-NEXT: str r0, [sp] +; CHECK-NEXT: movw r0, #19999 +; CHECK-NEXT: .LBB0_3: @ %for.cond +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ldr r2, [sp] +; CHECK-NEXT: cmp r2, r0 +; CHECK-NEXT: bhi .LBB0_5 +; CHECK-NEXT: @ %bb.4: @ %for.body +; CHECK-NEXT: @ in Loop: Header=BB0_3 Depth=1 +; CHECK-NEXT: ldr r2, [sp] +; CHECK-NEXT: ldr r3, [sp] +; CHECK-NEXT: str.w r2, [r1, r3, lsl #2] +; CHECK-NEXT: ldr r2, [sp] +; CHECK-NEXT: adds r2, #1 +; CHECK-NEXT: str r2, [sp] +; CHECK-NEXT: b .LBB0_3 +; CHECK-NEXT: .LBB0_5: @ %for.end +; CHECK-NEXT: add.w sp, sp, #79872 +; CHECK-NEXT: add sp, #132 +; CHECK-NEXT: pop {r4, r5} +; CHECK-NEXT: bx lr +entry: + %stack = alloca [20000 x i32], align 4 + %i = alloca i32, align 4 + store volatile i32 0, ptr %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %0 = load volatile i32, ptr %i, align 4 + %cmp = icmp ult i32 %0, 20000 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %1 = load volatile i32, ptr %i, align 4 + %2 = load volatile i32, ptr %i, align 4 + %arrayidx = getelementptr inbounds [20000 x i32], ptr %stack, i32 0, i32 %2 + store volatile i32 %1, ptr %arrayidx, align 4 + br label %for.inc + +for.inc: ; preds = %for.body + %3 = load volatile i32, ptr %i, align 4 + %inc = add nsw i32 %3, 1 + store volatile i32 %inc, ptr %i, align 4 + br label %for.cond, !llvm.loop !6 + +for.end: ; preds = %for.cond + ret void +} + +; Function Attrs: noinline nounwind optnone +define dso_local void @vla(i32 noundef %n) "probe-stack"="inline-asm" "frame-pointer"="none" { +; CHECK-LABEL: vla: +; 
CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r6, r7, lr} +; CHECK-NEXT: push {r4, r6, r7, lr} +; CHECK-NEXT: .setfp r7, sp, #8 +; CHECK-NEXT: add r7, sp, #8 +; CHECK-NEXT: .pad #16 +; CHECK-NEXT: sub sp, #16 +; CHECK-NEXT: movs r1, #7 +; CHECK-NEXT: str r0, [r7, #-12] +; CHECK-NEXT: add.w r1, r1, r0, lsl #2 +; CHECK-NEXT: str sp, [r7, #-16] +; CHECK-NEXT: bic r1, r1, #7 +; CHECK-NEXT: sub.w r1, sp, r1 +; CHECK-NEXT: .LBB1_1: @ %entry +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: .pad #4096 +; CHECK-NEXT: sub.w sp, sp, #4096 +; CHECK-NEXT: cmp sp, r1 +; CHECK-NEXT: ble .LBB1_3 +; CHECK-NEXT: @ %bb.2: @ %entry +; CHECK-NEXT: @ in Loop: Header=BB1_1 Depth=1 +; CHECK-NEXT: str.w r1, [sp, #1024] +; CHECK-NEXT: b .LBB1_1 +; CHECK-NEXT: .LBB1_3: @ %entry +; CHECK-NEXT: mov sp, r1 +; CHECK-NEXT: movs r2, #0 +; CHECK-NEXT: str.w r1, [sp, #1024] +; CHECK-NEXT: strb r2, [r1] +; CHECK-NEXT: str r0, [r7, #-20] +; CHECK-NEXT: ldr sp, [r7, #-16] +; CHECK-NEXT: sub.w r4, r7, #8 +; CHECK-NEXT: mov sp, r4 +; CHECK-NEXT: pop {r4, r6, r7, pc} +entry: + %n.addr = alloca i32, align 4 + %saved_stack = alloca ptr, align 4 + %__vla_expr0 = alloca i32, align 4 + store i32 %n, ptr %n.addr, align 4 + %0 = load i32, ptr %n.addr, align 4 + %1 = call ptr @llvm.stacksave() + store ptr %1, ptr %saved_stack, align 4 + %vla = alloca i32, i32 %0, align 4 + store i32 %0, ptr %__vla_expr0, align 4 + %arrayidx = getelementptr inbounds i32, ptr %vla, i32 0 + call void @llvm.memset.p0.i32(ptr align 4 %arrayidx, i8 0, i32 1, i1 false) + %2 = load ptr, ptr %saved_stack, align 4 + call void @llvm.stackrestore(ptr %2) + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind willreturn +declare ptr @llvm.stacksave() #1 + +; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: write) +declare void @llvm.memset.p0.i32(ptr nocapture writeonly, i8, i32, i1 immarg) #2 + +; Function Attrs: nocallback nofree nosync nounwind willreturn +declare void @llvm.stackrestore(ptr) #1 + +; Function Attrs: noinline nounwind optnone +define dso_local void @builtin_alloca(i32 noundef %n) "probe-stack"="inline-asm" "frame-pointer"="none" { +; CHECK-LABEL: builtin_alloca: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r6, r7, lr} +; CHECK-NEXT: push {r4, r6, r7, lr} +; CHECK-NEXT: .setfp r7, sp, #8 +; CHECK-NEXT: add r7, sp, #8 +; CHECK-NEXT: .pad #8 +; CHECK-NEXT: sub sp, #8 +; CHECK-NEXT: str r0, [r7, #-12] +; CHECK-NEXT: adds r0, #7 +; CHECK-NEXT: bic r0, r0, #7 +; CHECK-NEXT: sub.w r0, sp, r0 +; CHECK-NEXT: .LBB2_1: @ %entry +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: .pad #4096 +; CHECK-NEXT: sub.w sp, sp, #4096 +; CHECK-NEXT: cmp sp, r0 +; CHECK-NEXT: ble .LBB2_3 +; CHECK-NEXT: @ %bb.2: @ %entry +; CHECK-NEXT: @ in Loop: Header=BB2_1 Depth=1 +; CHECK-NEXT: str.w r0, [sp, #1024] +; CHECK-NEXT: b .LBB2_1 +; CHECK-NEXT: .LBB2_3: @ %entry +; CHECK-NEXT: mov sp, r0 +; CHECK-NEXT: sub.w r4, r7, #8 +; CHECK-NEXT: str.w r0, [sp, #1024] +; CHECK-NEXT: str r0, [r7, #-16] +; CHECK-NEXT: mov sp, r4 +; CHECK-NEXT: pop {r4, r6, r7, pc} +entry: + %n.addr = alloca i32, align 4 + %mem = alloca ptr, align 4 + store i32 %n, ptr %n.addr, align 4 + %0 = load i32, ptr %n.addr, align 4 + %1 = alloca i8, i32 %0, align 8 + store ptr %1, ptr %mem, align 4 + ret void +} + +attributes #1 = { nocallback nofree nosync nounwind willreturn } +attributes #2 = { nocallback nofree nounwind willreturn memory(argmem: write) } + +!llvm.module.flags = !{!0, !1, !2, !3, !4} +!llvm.ident = !{!5} + +!0 = !{i32 
1, !"wchar_size", i32 4} +!1 = !{i32 1, !"min_enum_size", i32 4} +!2 = !{i32 8, !"PIC Level", i32 2} +!3 = !{i32 7, !"PIE Level", i32 2} +!4 = !{i32 7, !"frame-pointer", i32 2} +!5 = !{!"clang version 17.0.0 (https://github.com/llvm/llvm-project a1677bda7975a0f690292587a04b9e053aacd1dc)"} +!6 = distinct !{!6, !7} +!7 = !{!"llvm.loop.mustprogress"}