diff --git a/llvm/lib/Target/PowerPC/PPCFrameLowering.h b/llvm/lib/Target/PowerPC/PPCFrameLowering.h --- a/llvm/lib/Target/PowerPC/PPCFrameLowering.h +++ b/llvm/lib/Target/PowerPC/PPCFrameLowering.h @@ -100,6 +100,8 @@ /// the function. void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override; void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override; + void inlineStackProbe(MachineFunction &MF, + MachineBasicBlock &PrologMBB) const override; bool hasFP(const MachineFunction &MF) const override; bool needsFP(const MachineFunction &MF) const; diff --git a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp --- a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp @@ -10,6 +10,7 @@ // //===----------------------------------------------------------------------===// +#include "MCTargetDesc/PPCPredicates.h" #include "PPCFrameLowering.h" #include "PPCInstrBuilder.h" #include "PPCInstrInfo.h" @@ -31,6 +32,7 @@ #define DEBUG_TYPE "framelowering" STATISTIC(NumPESpillVSR, "Number of spills to vector in prologue"); STATISTIC(NumPEReloadVSR, "Number of reloads from vector in epilogue"); +STATISTIC(NumPrologProbed, "Number of prologues probed"); static cl::opt EnablePEVectorSpills("ppc-enable-pe-vector-spills", @@ -757,6 +759,7 @@ MachineFrameInfo &MFI = MF.getFrameInfo(); const PPCInstrInfo &TII = *Subtarget.getInstrInfo(); const PPCRegisterInfo *RegInfo = Subtarget.getRegisterInfo(); + const PPCTargetLowering &TLI = *Subtarget.getTargetLowering(); MachineModuleInfo &MMI = MF.getMMI(); const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo(); @@ -845,9 +848,12 @@ "FrameSize must be >0 to save/restore the FP or LR for 32-bit SVR4."); // Using the same bool variable as below to suppress compiler warnings. 
- bool SingleScratchReg = - findScratchRegister(&MBB, false, twoUniqueScratchRegsRequired(&MBB), - &ScratchReg, &TempReg); + // Stack probe requires two scratch registers, one for old sp, one for large + // frame and large probe size. + bool SingleScratchReg = findScratchRegister( + &MBB, false, + twoUniqueScratchRegsRequired(&MBB) || TLI.hasInlineStackProbe(MF), + &ScratchReg, &TempReg); assert(SingleScratchReg && "Required number of registers not available in this block"); @@ -1023,58 +1029,81 @@ // the negated frame size will be placed in ScratchReg. bool HasSTUX = false; - // This condition must be kept in sync with canUseAsPrologue. - if (HasBP && MaxAlign > 1) { - if (isPPC64) - BuildMI(MBB, MBBI, dl, TII.get(PPC::RLDICL), ScratchReg) - .addReg(SPReg) - .addImm(0) - .addImm(64 - Log2(MaxAlign)); - else // PPC32... - BuildMI(MBB, MBBI, dl, TII.get(PPC::RLWINM), ScratchReg) - .addReg(SPReg) - .addImm(0) - .addImm(32 - Log2(MaxAlign)) - .addImm(31); - if (!isLargeFrame) { - BuildMI(MBB, MBBI, dl, SubtractImmCarryingInst, ScratchReg) - .addReg(ScratchReg, RegState::Kill) + // If FrameSize <= TLI.getStackProbeSize(MF), as POWER ABI requires backchain + // pointer is always stored at SP, we will get a free probe due to an essential + // STU(X) instruction. + if (TLI.hasInlineStackProbe(MF) && FrameSize > TLI.getStackProbeSize(MF)) { + // To be consistent with other targets, a pseudo instruction is emitted and + // will be later expanded in `inlineStackProbe`. + BuildMI(MBB, MBBI, dl, + TII.get(isPPC64 ? PPC::PROBED_STACKALLOC_64 + : PPC::PROBED_STACKALLOC_32)) + .addDef(ScratchReg) + .addDef(TempReg) // TempReg stores the old sp. 
.addImm(NegFrameSize); - } else { - assert(!SingleScratchReg && "Only a single scratch reg available"); - BuildMI(MBB, MBBI, dl, LoadImmShiftedInst, TempReg) - .addImm(NegFrameSize >> 16); - BuildMI(MBB, MBBI, dl, OrImmInst, TempReg) - .addReg(TempReg, RegState::Kill) - .addImm(NegFrameSize & 0xFFFF); - BuildMI(MBB, MBBI, dl, SubtractCarryingInst, ScratchReg) - .addReg(ScratchReg, RegState::Kill) - .addReg(TempReg, RegState::Kill); + // FIXME: HasSTUX is only read if HasRedZone is not set, in such case, we + // update the ScratchReg to meet the assumption that ScratchReg contains + // the NegFrameSize. This solution is rather tricky. + if (!HasRedZone) { + BuildMI(MBB, MBBI, dl, TII.get(PPC::SUBF), ScratchReg) + .addReg(TempReg) + .addReg(SPReg); + HasSTUX = true; } + } else { + // This condition must be kept in sync with canUseAsPrologue. + if (HasBP && MaxAlign > 1) { + if (isPPC64) + BuildMI(MBB, MBBI, dl, TII.get(PPC::RLDICL), ScratchReg) + .addReg(SPReg) + .addImm(0) + .addImm(64 - Log2(MaxAlign)); + else // PPC32... 
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::RLWINM), ScratchReg) + .addReg(SPReg) + .addImm(0) + .addImm(32 - Log2(MaxAlign)) + .addImm(31); + if (!isLargeFrame) { + BuildMI(MBB, MBBI, dl, SubtractImmCarryingInst, ScratchReg) + .addReg(ScratchReg, RegState::Kill) + .addImm(NegFrameSize); + } else { + assert(!SingleScratchReg && "Only a single scratch reg available"); + BuildMI(MBB, MBBI, dl, LoadImmShiftedInst, TempReg) + .addImm(NegFrameSize >> 16); + BuildMI(MBB, MBBI, dl, OrImmInst, TempReg) + .addReg(TempReg, RegState::Kill) + .addImm(NegFrameSize & 0xFFFF); + BuildMI(MBB, MBBI, dl, SubtractCarryingInst, ScratchReg) + .addReg(ScratchReg, RegState::Kill) + .addReg(TempReg, RegState::Kill); + } - BuildMI(MBB, MBBI, dl, StoreUpdtIdxInst, SPReg) - .addReg(SPReg, RegState::Kill) - .addReg(SPReg) - .addReg(ScratchReg); - HasSTUX = true; + BuildMI(MBB, MBBI, dl, StoreUpdtIdxInst, SPReg) + .addReg(SPReg, RegState::Kill) + .addReg(SPReg) + .addReg(ScratchReg); + HasSTUX = true; - } else if (!isLargeFrame) { - BuildMI(MBB, StackUpdateLoc, dl, StoreUpdtInst, SPReg) - .addReg(SPReg) - .addImm(NegFrameSize) - .addReg(SPReg); + } else if (!isLargeFrame) { + BuildMI(MBB, StackUpdateLoc, dl, StoreUpdtInst, SPReg) + .addReg(SPReg) + .addImm(NegFrameSize) + .addReg(SPReg); - } else { - BuildMI(MBB, MBBI, dl, LoadImmShiftedInst, ScratchReg) - .addImm(NegFrameSize >> 16); - BuildMI(MBB, MBBI, dl, OrImmInst, ScratchReg) - .addReg(ScratchReg, RegState::Kill) - .addImm(NegFrameSize & 0xFFFF); - BuildMI(MBB, MBBI, dl, StoreUpdtIdxInst, SPReg) - .addReg(SPReg, RegState::Kill) - .addReg(SPReg) - .addReg(ScratchReg); - HasSTUX = true; + } else { + BuildMI(MBB, MBBI, dl, LoadImmShiftedInst, ScratchReg) + .addImm(NegFrameSize >> 16); + BuildMI(MBB, MBBI, dl, OrImmInst, ScratchReg) + .addReg(ScratchReg, RegState::Kill) + .addImm(NegFrameSize & 0xFFFF); + BuildMI(MBB, MBBI, dl, StoreUpdtIdxInst, SPReg) + .addReg(SPReg, RegState::Kill) + .addReg(SPReg) + .addReg(ScratchReg); + HasSTUX = true; + } } 
// Save the TOC register after the stack pointer update if a prologue TOC
@@ -1335,6 +1364,162 @@
   }
 }
 
+/// Expand the PROBED_STACKALLOC_{32,64} pseudo in \p PrologMBB into real
+/// stack-allocating probes; does nothing if the block holds no such pseudo.
+///
+/// Operand 0 of the pseudo is used as a scratch register, operand 1 receives
+/// a copy of the incoming stack pointer (the value every probe stores) and
+/// operand 2 is the negated frame size. The frame is allocated with
+/// store-with-update instructions: first an optional residual chunk of
+/// NegFrameSize % NegProbeSize bytes, then NumBlocks chunks of the probe
+/// size. Fewer than three such chunks are emitted inline; otherwise a CTR
+/// loop is synthesized in new blocks inserted right after \p PrologMBB, and
+/// the instructions that followed the pseudo are moved to the loop's exit.
+void PPCFrameLowering::inlineStackProbe(MachineFunction &MF,
+                                        MachineBasicBlock &PrologMBB) const {
+  // TODO: Generate CFI instructions.
+  bool isPPC64 = Subtarget.isPPC64();
+  const PPCTargetLowering &TLI = *Subtarget.getTargetLowering();
+  const PPCInstrInfo &TII = *Subtarget.getInstrInfo();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  auto StackAllocMIPos = llvm::find_if(PrologMBB, [](MachineInstr &MI) {
+    unsigned Opc = MI.getOpcode();
+    return Opc == PPC::PROBED_STACKALLOC_64 || Opc == PPC::PROBED_STACKALLOC_32;
+  });
+  if (StackAllocMIPos == PrologMBB.end())
+    return;
+  const BasicBlock *ProbedBB = PrologMBB.getBasicBlock();
+  DebugLoc DL = PrologMBB.findDebugLoc(StackAllocMIPos);
+  MachineInstr &MI = *StackAllocMIPos;
+  int64_t NegFrameSize = MI.getOperand(2).getImm();
+  int64_t NegProbeSize = -(int64_t)TLI.getStackProbeSize(MF);
+  assert(isInt<32>(NegProbeSize) && "Unhandled probe size");
+  int64_t NumBlocks = NegFrameSize / NegProbeSize;
+  int64_t NegResidualSize = NegFrameSize % NegProbeSize;
+  Register SPReg = isPPC64 ? PPC::X1 : PPC::R1;
+  Register ScratchReg = MI.getOperand(0).getReg();
+  Register FPReg = MI.getOperand(1).getReg();
+  const PPCRegisterInfo *RegInfo = Subtarget.getRegisterInfo();
+  bool HasBP = RegInfo->hasBasePointer(MF);
+  Align MaxAlign = MFI.getMaxAlign();
+  // Initialize current frame pointer.
+  const MCInstrDesc &CopyInst = TII.get(isPPC64 ? PPC::OR8 : PPC::OR);
+  BuildMI(PrologMBB, {MI}, DL, CopyInst, FPReg).addReg(SPReg).addReg(SPReg);
+  // Subroutine to determine if we can use the Imm as part of d-form.
+  auto CanUseDForm = [](int64_t Imm) { return isInt<16>(Imm) && Imm % 4 == 0; };
+  // Subroutine to materialize the Imm into TempReg.
+  auto MaterializeImm = [&](MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator MBBI, int64_t Imm,
+                            Register &TempReg) {
+    assert(isInt<32>(Imm) && "Unhandled imm");
+    if (isInt<16>(Imm))
+      BuildMI(MBB, MBBI, DL, TII.get(isPPC64 ? PPC::LI8 : PPC::LI), TempReg)
+          .addImm(Imm);
+    else {
+      BuildMI(MBB, MBBI, DL, TII.get(isPPC64 ? PPC::LIS8 : PPC::LIS), TempReg)
+          .addImm(Imm >> 16);
+      BuildMI(MBB, MBBI, DL, TII.get(isPPC64 ? PPC::ORI8 : PPC::ORI), TempReg)
+          .addReg(TempReg)
+          .addImm(Imm & 0xFFFF);
+    }
+  };
+  // Subroutine to store frame pointer and decrease stack pointer by probe size.
+  auto allocateAndProbe = [&](MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator MBBI, int64_t NegSize,
+                              Register NegSizeReg, bool UseDForm) {
+    if (UseDForm)
+      BuildMI(MBB, MBBI, DL, TII.get(isPPC64 ? PPC::STDU : PPC::STWU), SPReg)
+          .addReg(FPReg)
+          .addImm(NegSize)
+          .addReg(SPReg);
+    else
+      BuildMI(MBB, MBBI, DL, TII.get(isPPC64 ? PPC::STDUX : PPC::STWUX), SPReg)
+          .addReg(FPReg)
+          .addReg(SPReg)
+          .addReg(NegSizeReg);
+  };
+  // For case HasBP && MaxAlign > 1, we have to align the SP by performing
+  // SP = SP - SP % MaxAlign.
+  if (HasBP && MaxAlign > 1) {
+    if (isPPC64)
+      BuildMI(PrologMBB, {MI}, DL, TII.get(PPC::RLDICL), ScratchReg)
+          .addReg(FPReg)
+          .addImm(0)
+          .addImm(64 - Log2(MaxAlign));
+    else
+      BuildMI(PrologMBB, {MI}, DL, TII.get(PPC::RLWINM), ScratchReg)
+          .addReg(FPReg)
+          .addImm(0)
+          .addImm(32 - Log2(MaxAlign))
+          .addImm(31);
+    // RLDICL/RLWINM leave the non-negative remainder SP % MaxAlign in
+    // ScratchReg. The indexed store-with-update below *adds* its index
+    // register to SP, so negate the remainder first; otherwise SP would move
+    // up into the caller's frame instead of down to the aligned address.
+    BuildMI(PrologMBB, {MI}, DL, TII.get(isPPC64 ? PPC::NEG8 : PPC::NEG),
+            ScratchReg)
+        .addReg(ScratchReg);
+    // FIXME: If SP is already MaxAlign-aligned the index is zero, so the store
+    // below targets the word at the incoming SP; verify this is intended.
+    BuildMI(PrologMBB, {MI}, DL, TII.get(isPPC64 ? PPC::STDUX : PPC::STWUX),
+            SPReg)
+        .addReg(FPReg)
+        .addReg(SPReg)
+        .addReg(ScratchReg);
+  }
+  // Probe residual part.
+  if (NegResidualSize) {
+    bool ResidualUseDForm = CanUseDForm(NegResidualSize);
+    if (!ResidualUseDForm)
+      MaterializeImm(PrologMBB, {MI}, NegResidualSize, ScratchReg);
+    allocateAndProbe(PrologMBB, {MI}, NegResidualSize, ScratchReg,
+                     ResidualUseDForm);
+  }
+  bool UseDForm = CanUseDForm(NegProbeSize);
+  // If number of blocks is small, just probe them directly.
+  if (NumBlocks < 3) {
+    if (!UseDForm)
+      MaterializeImm(PrologMBB, {MI}, NegProbeSize, ScratchReg);
+    for (int64_t i = 0; i < NumBlocks; ++i)
+      allocateAndProbe(PrologMBB, {MI}, NegProbeSize, ScratchReg, UseDForm);
+  } else {
+    // Since CTR is a volatile register and current shrinkwrap implementation
+    // won't choose an MBB in a loop as the PrologMBB, it's safe to synthesize a
+    // CTR loop to probe.
+    // Calculate trip count and stores it in CTRReg.
+    MaterializeImm(PrologMBB, {MI}, NumBlocks, ScratchReg);
+    BuildMI(PrologMBB, {MI}, DL, TII.get(isPPC64 ? PPC::MTCTR8 : PPC::MTCTR))
+        .addReg(ScratchReg, RegState::Kill);
+    if (!UseDForm)
+      MaterializeImm(PrologMBB, {MI}, NegProbeSize, ScratchReg);
+    // Create MBBs of the loop.
+    MachineFunction::iterator MBBInsertPoint =
+        std::next(PrologMBB.getIterator());
+    MachineBasicBlock *LoopMBB = MF.CreateMachineBasicBlock(ProbedBB);
+    MF.insert(MBBInsertPoint, LoopMBB);
+    MachineBasicBlock *ExitMBB = MF.CreateMachineBasicBlock(ProbedBB);
+    MF.insert(MBBInsertPoint, ExitMBB);
+    // Synthesize the loop body.
+    allocateAndProbe(*LoopMBB, LoopMBB->end(), NegProbeSize, ScratchReg,
+                     UseDForm);
+    BuildMI(LoopMBB, DL, TII.get(isPPC64 ? PPC::BDNZ8 : PPC::BDNZ))
+        .addMBB(LoopMBB);
+    LoopMBB->addSuccessor(ExitMBB);
+    LoopMBB->addSuccessor(LoopMBB);
+    // Synthesize the exit MBB.
+    ExitMBB->splice(ExitMBB->end(), &PrologMBB,
+                    std::next(MachineBasicBlock::iterator(MI)),
+                    PrologMBB.end());
+    ExitMBB->transferSuccessorsAndUpdatePHIs(&PrologMBB);
+    PrologMBB.addSuccessor(LoopMBB);
+    // Update liveins.
+    recomputeLiveIns(*LoopMBB);
+    recomputeLiveIns(*ExitMBB);
+  }
+  ++NumPrologProbed;
+  MI.eraseFromParent();
+}
+
 void PPCFrameLowering::emitEpilogue(MachineFunction &MF,
                                     MachineBasicBlock &MBB) const {
   MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
diff --git a/llvm/lib/Target/PowerPC/PPCInstr64Bit.td b/llvm/lib/Target/PowerPC/PPCInstr64Bit.td
--- a/llvm/lib/Target/PowerPC/PPCInstr64Bit.td
+++ b/llvm/lib/Target/PowerPC/PPCInstr64Bit.td
@@ -434,6 +434,9 @@
 def PREPARE_PROBED_ALLOCA_64 : PPCEmitTimePseudo<(outs g8rc:$fp,
     g8rc:$sp),
     (ins g8rc:$negsize, memri:$fpsi), "#PREPARE_PROBED_ALLOCA_64", []>;
+def PROBED_STACKALLOC_64 : PPCEmitTimePseudo<(outs g8rc:$scratch, g8rc:$temp),
+    (ins i64imm:$stacksize),
+    "#PROBED_STACKALLOC_64", []>;
 }
 
 let hasSideEffects = 0 in {
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td
@@ -1409,6 +1409,9 @@
 def PREPARE_PROBED_ALLOCA_32 : PPCEmitTimePseudo<(outs gprc:$fp,
     gprc:$sp),
     (ins gprc:$negsize, memri:$fpsi), "#PREPARE_PROBED_ALLOCA_32", []>;
+def PROBED_STACKALLOC_32 : PPCEmitTimePseudo<(outs gprc:$scratch, gprc:$temp),
+    (ins i64imm:$stacksize),
+    "#PROBED_STACKALLOC_32", []>;
 }
 
 // SELECT_CC_* - Used to implement the SELECT_CC DAG operation.
Expanded after diff --git a/llvm/test/CodeGen/PowerPC/stack-clash-prologue.ll b/llvm/test/CodeGen/PowerPC/stack-clash-prologue.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/stack-clash-prologue.ll @@ -0,0 +1,474 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -ppc-asm-full-reg-names -verify-machineinstrs \ +; RUN: -mtriple=powerpc64le-linux-gnu < %s | FileCheck \ +; RUN: -check-prefix=CHECK-LE %s +; RUN: llc -ppc-asm-full-reg-names -verify-machineinstrs \ +; RUN: -mtriple=powerpc64-linux-gnu < %s | FileCheck \ +; RUN: -check-prefix=CHECK-BE %s +; RUN: llc -ppc-asm-full-reg-names -verify-machineinstrs \ +; RUN: -mtriple=powerpc-linux-gnu < %s | FileCheck \ +; RUN: -check-prefix=CHECK-32 %s + +; Free probe +define i8 @f0() #0 nounwind { +; CHECK-LE-LABEL: f0: +; CHECK-LE: # %bb.0: # %entry +; CHECK-LE-NEXT: li r3, 3 +; CHECK-LE-NEXT: stb r3, -64(r1) +; CHECK-LE-NEXT: lbz r3, -64(r1) +; CHECK-LE-NEXT: blr +; +; CHECK-BE-LABEL: f0: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: li r3, 3 +; CHECK-BE-NEXT: stb r3, -64(r1) +; CHECK-BE-NEXT: lbz r3, -64(r1) +; CHECK-BE-NEXT: blr +; +; CHECK-32-LABEL: f0: +; CHECK-32: # %bb.0: # %entry +; CHECK-32-NEXT: stwu r1, -80(r1) +; CHECK-32-NEXT: li r3, 3 +; CHECK-32-NEXT: stb r3, 16(r1) +; CHECK-32-NEXT: lbz r3, 16(r1) +; CHECK-32-NEXT: addi r1, r1, 80 +; CHECK-32-NEXT: blr +entry: + %a = alloca i8, i64 64 + %b = getelementptr inbounds i8, i8* %a, i64 63 + store volatile i8 3, i8* %a + %c = load volatile i8, i8* %a + ret i8 %c +} + +define i8 @f1() #0 "stack-probe-size"="0" nounwind { +; CHECK-LE-LABEL: f1: +; CHECK-LE: # %bb.0: # %entry +; CHECK-LE-NEXT: mr r12, r1 +; CHECK-LE-NEXT: li r0, 259 +; CHECK-LE-NEXT: mtctr r0 +; CHECK-LE-NEXT: .LBB1_1: # %entry +; CHECK-LE-NEXT: # +; CHECK-LE-NEXT: stdu r12, -16(r1) +; CHECK-LE-NEXT: bdnz .LBB1_1 +; CHECK-LE-NEXT: # %bb.2: # %entry +; CHECK-LE-NEXT: li r3, 3 +; CHECK-LE-NEXT: stb r3, 48(r1) +; CHECK-LE-NEXT: lbz r3, 
48(r1) +; CHECK-LE-NEXT: addi r1, r1, 4144 +; CHECK-LE-NEXT: blr +; +; CHECK-BE-LABEL: f1: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: mr r12, r1 +; CHECK-BE-NEXT: li r0, 260 +; CHECK-BE-NEXT: mtctr r0 +; CHECK-BE-NEXT: .LBB1_1: # %entry +; CHECK-BE-NEXT: # +; CHECK-BE-NEXT: stdu r12, -16(r1) +; CHECK-BE-NEXT: bdnz .LBB1_1 +; CHECK-BE-NEXT: # %bb.2: # %entry +; CHECK-BE-NEXT: li r3, 3 +; CHECK-BE-NEXT: stb r3, 64(r1) +; CHECK-BE-NEXT: lbz r3, 64(r1) +; CHECK-BE-NEXT: addi r1, r1, 4160 +; CHECK-BE-NEXT: blr +; +; CHECK-32-LABEL: f1: +; CHECK-32: # %bb.0: # %entry +; CHECK-32-NEXT: mr r12, r1 +; CHECK-32-NEXT: li r0, 257 +; CHECK-32-NEXT: mtctr r0 +; CHECK-32-NEXT: .LBB1_1: # %entry +; CHECK-32-NEXT: # +; CHECK-32-NEXT: stwu r12, -16(r1) +; CHECK-32-NEXT: bdnz .LBB1_1 +; CHECK-32-NEXT: # %bb.2: # %entry +; CHECK-32-NEXT: li r3, 3 +; CHECK-32-NEXT: sub r0, r1, r12 +; CHECK-32-NEXT: stb r3, 16(r1) +; CHECK-32-NEXT: sub r0, r1, r0 +; CHECK-32-NEXT: lbz r3, 16(r1) +; CHECK-32-NEXT: addi r1, r1, 4112 +; CHECK-32-NEXT: blr +entry: + %a = alloca i8, i64 4096 + %b = getelementptr inbounds i8, i8* %a, i64 63 + store volatile i8 3, i8* %a + %c = load volatile i8, i8* %a + ret i8 %c +} + +define i8 @f2() #0 nounwind { +; CHECK-LE-LABEL: f2: +; CHECK-LE: # %bb.0: # %entry +; CHECK-LE-NEXT: mr r12, r1 +; CHECK-LE-NEXT: stdu r12, -48(r1) +; CHECK-LE-NEXT: li r0, 16 +; CHECK-LE-NEXT: mtctr r0 +; CHECK-LE-NEXT: .LBB2_1: # %entry +; CHECK-LE-NEXT: # +; CHECK-LE-NEXT: stdu r12, -4096(r1) +; CHECK-LE-NEXT: bdnz .LBB2_1 +; CHECK-LE-NEXT: # %bb.2: # %entry +; CHECK-LE-NEXT: li r3, 3 +; CHECK-LE-NEXT: stb r3, 48(r1) +; CHECK-LE-NEXT: lbz r3, 48(r1) +; CHECK-LE-NEXT: ld r1, 0(r1) +; CHECK-LE-NEXT: blr +; +; CHECK-BE-LABEL: f2: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: mr r12, r1 +; CHECK-BE-NEXT: stdu r12, -64(r1) +; CHECK-BE-NEXT: li r0, 16 +; CHECK-BE-NEXT: mtctr r0 +; CHECK-BE-NEXT: .LBB2_1: # %entry +; CHECK-BE-NEXT: # +; CHECK-BE-NEXT: stdu r12, -4096(r1) +; CHECK-BE-NEXT: 
bdnz .LBB2_1 +; CHECK-BE-NEXT: # %bb.2: # %entry +; CHECK-BE-NEXT: li r3, 3 +; CHECK-BE-NEXT: stb r3, 64(r1) +; CHECK-BE-NEXT: lbz r3, 64(r1) +; CHECK-BE-NEXT: ld r1, 0(r1) +; CHECK-BE-NEXT: blr +; +; CHECK-32-LABEL: f2: +; CHECK-32: # %bb.0: # %entry +; CHECK-32-NEXT: mr r12, r1 +; CHECK-32-NEXT: stwu r12, -16(r1) +; CHECK-32-NEXT: li r0, 16 +; CHECK-32-NEXT: mtctr r0 +; CHECK-32-NEXT: .LBB2_1: # %entry +; CHECK-32-NEXT: # +; CHECK-32-NEXT: stwu r12, -4096(r1) +; CHECK-32-NEXT: bdnz .LBB2_1 +; CHECK-32-NEXT: # %bb.2: # %entry +; CHECK-32-NEXT: sub r0, r1, r12 +; CHECK-32-NEXT: li r3, 3 +; CHECK-32-NEXT: sub r0, r1, r0 +; CHECK-32-NEXT: stb r3, 16(r1) +; CHECK-32-NEXT: mr r0, r31 +; CHECK-32-NEXT: lbz r3, 16(r1) +; CHECK-32-NEXT: lwz r31, 0(r1) +; CHECK-32-NEXT: mr r1, r31 +; CHECK-32-NEXT: mr r31, r0 +; CHECK-32-NEXT: blr +entry: + %a = alloca i8, i64 65536 + %b = getelementptr inbounds i8, i8* %a, i64 63 + store volatile i8 3, i8* %a + %c = load volatile i8, i8* %a + ret i8 %c +} + +define i8 @f3() #0 "stack-probe-size"="32768" nounwind { +; CHECK-LE-LABEL: f3: +; CHECK-LE: # %bb.0: # %entry +; CHECK-LE-NEXT: mr r12, r1 +; CHECK-LE-NEXT: stdu r12, -48(r1) +; CHECK-LE-NEXT: stdu r12, -32768(r1) +; CHECK-LE-NEXT: stdu r12, -32768(r1) +; CHECK-LE-NEXT: li r3, 3 +; CHECK-LE-NEXT: stb r3, 48(r1) +; CHECK-LE-NEXT: lbz r3, 48(r1) +; CHECK-LE-NEXT: ld r1, 0(r1) +; CHECK-LE-NEXT: blr +; +; CHECK-BE-LABEL: f3: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: mr r12, r1 +; CHECK-BE-NEXT: stdu r12, -64(r1) +; CHECK-BE-NEXT: stdu r12, -32768(r1) +; CHECK-BE-NEXT: stdu r12, -32768(r1) +; CHECK-BE-NEXT: li r3, 3 +; CHECK-BE-NEXT: stb r3, 64(r1) +; CHECK-BE-NEXT: lbz r3, 64(r1) +; CHECK-BE-NEXT: ld r1, 0(r1) +; CHECK-BE-NEXT: blr +; +; CHECK-32-LABEL: f3: +; CHECK-32: # %bb.0: # %entry +; CHECK-32-NEXT: mr r12, r1 +; CHECK-32-NEXT: stwu r12, -16(r1) +; CHECK-32-NEXT: stwu r12, -32768(r1) +; CHECK-32-NEXT: stwu r12, -32768(r1) +; CHECK-32-NEXT: sub r0, r1, r12 +; CHECK-32-NEXT: 
li r3, 3 +; CHECK-32-NEXT: sub r0, r1, r0 +; CHECK-32-NEXT: stb r3, 16(r1) +; CHECK-32-NEXT: mr r0, r31 +; CHECK-32-NEXT: lbz r3, 16(r1) +; CHECK-32-NEXT: lwz r31, 0(r1) +; CHECK-32-NEXT: mr r1, r31 +; CHECK-32-NEXT: mr r31, r0 +; CHECK-32-NEXT: blr +entry: + %a = alloca i8, i64 65536 + %b = getelementptr inbounds i8, i8* %a, i64 63 + store volatile i8 3, i8* %a + %c = load volatile i8, i8* %a + ret i8 %c +} + +; Same as f2, but without protection. +define i8 @f4() nounwind { +; CHECK-LE-LABEL: f4: +; CHECK-LE: # %bb.0: # %entry +; CHECK-LE-NEXT: lis r0, -2 +; CHECK-LE-NEXT: ori r0, r0, 65488 +; CHECK-LE-NEXT: stdux r1, r1, r0 +; CHECK-LE-NEXT: li r3, 3 +; CHECK-LE-NEXT: stb r3, 48(r1) +; CHECK-LE-NEXT: lbz r3, 48(r1) +; CHECK-LE-NEXT: ld r1, 0(r1) +; CHECK-LE-NEXT: blr +; +; CHECK-BE-LABEL: f4: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: lis r0, -2 +; CHECK-BE-NEXT: ori r0, r0, 65472 +; CHECK-BE-NEXT: stdux r1, r1, r0 +; CHECK-BE-NEXT: li r3, 3 +; CHECK-BE-NEXT: stb r3, 64(r1) +; CHECK-BE-NEXT: lbz r3, 64(r1) +; CHECK-BE-NEXT: ld r1, 0(r1) +; CHECK-BE-NEXT: blr +; +; CHECK-32-LABEL: f4: +; CHECK-32: # %bb.0: # %entry +; CHECK-32-NEXT: lis r0, -2 +; CHECK-32-NEXT: ori r0, r0, 65520 +; CHECK-32-NEXT: stwux r1, r1, r0 +; CHECK-32-NEXT: li r3, 3 +; CHECK-32-NEXT: sub r0, r1, r0 +; CHECK-32-NEXT: stb r3, 16(r1) +; CHECK-32-NEXT: mr r0, r31 +; CHECK-32-NEXT: lbz r3, 16(r1) +; CHECK-32-NEXT: lwz r31, 0(r1) +; CHECK-32-NEXT: mr r1, r31 +; CHECK-32-NEXT: mr r31, r0 +; CHECK-32-NEXT: blr +entry: + %a = alloca i8, i64 65536 + %b = getelementptr inbounds i8, i8* %a, i64 63 + store volatile i8 3, i8* %a + %c = load volatile i8, i8* %a + ret i8 %c +} + +define i8 @f5() #0 "stack-probe-size"="65536" nounwind { +; CHECK-LE-LABEL: f5: +; CHECK-LE: # %bb.0: # %entry +; CHECK-LE-NEXT: mr r12, r1 +; CHECK-LE-NEXT: stdu r12, -48(r1) +; CHECK-LE-NEXT: li r0, 16 +; CHECK-LE-NEXT: mtctr r0 +; CHECK-LE-NEXT: lis r0, -1 +; CHECK-LE-NEXT: nop +; CHECK-LE-NEXT: .LBB5_1: # %entry +; 
CHECK-LE-NEXT: # +; CHECK-LE-NEXT: stdux r12, r1, r0 +; CHECK-LE-NEXT: bdnz .LBB5_1 +; CHECK-LE-NEXT: # %bb.2: # %entry +; CHECK-LE-NEXT: li r3, 3 +; CHECK-LE-NEXT: stb r3, 48(r1) +; CHECK-LE-NEXT: lbz r3, 48(r1) +; CHECK-LE-NEXT: ld r1, 0(r1) +; CHECK-LE-NEXT: blr +; +; CHECK-BE-LABEL: f5: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: mr r12, r1 +; CHECK-BE-NEXT: stdu r12, -64(r1) +; CHECK-BE-NEXT: li r0, 16 +; CHECK-BE-NEXT: mtctr r0 +; CHECK-BE-NEXT: lis r0, -1 +; CHECK-BE-NEXT: nop +; CHECK-BE-NEXT: .LBB5_1: # %entry +; CHECK-BE-NEXT: # +; CHECK-BE-NEXT: stdux r12, r1, r0 +; CHECK-BE-NEXT: bdnz .LBB5_1 +; CHECK-BE-NEXT: # %bb.2: # %entry +; CHECK-BE-NEXT: li r3, 3 +; CHECK-BE-NEXT: stb r3, 64(r1) +; CHECK-BE-NEXT: lbz r3, 64(r1) +; CHECK-BE-NEXT: ld r1, 0(r1) +; CHECK-BE-NEXT: blr +; +; CHECK-32-LABEL: f5: +; CHECK-32: # %bb.0: # %entry +; CHECK-32-NEXT: mr r12, r1 +; CHECK-32-NEXT: stwu r12, -16(r1) +; CHECK-32-NEXT: li r0, 16 +; CHECK-32-NEXT: mtctr r0 +; CHECK-32-NEXT: lis r0, -1 +; CHECK-32-NEXT: nop +; CHECK-32-NEXT: .LBB5_1: # %entry +; CHECK-32-NEXT: # +; CHECK-32-NEXT: stwux r12, r1, r0 +; CHECK-32-NEXT: bdnz .LBB5_1 +; CHECK-32-NEXT: # %bb.2: # %entry +; CHECK-32-NEXT: sub r0, r1, r12 +; CHECK-32-NEXT: li r3, 3 +; CHECK-32-NEXT: sub r0, r1, r0 +; CHECK-32-NEXT: stb r3, 16(r1) +; CHECK-32-NEXT: mr r0, r31 +; CHECK-32-NEXT: lbz r3, 16(r1) +; CHECK-32-NEXT: lwz r31, 0(r1) +; CHECK-32-NEXT: mr r1, r31 +; CHECK-32-NEXT: mr r31, r0 +; CHECK-32-NEXT: blr +entry: + %a = alloca i8, i64 1048576 + %b = getelementptr inbounds i8, i8* %a, i64 63 + store volatile i8 3, i8* %a + %c = load volatile i8, i8* %a + ret i8 %c +} + +define i8 @f6() #0 nounwind { +; CHECK-LE-LABEL: f6: +; CHECK-LE: # %bb.0: # %entry +; CHECK-LE-NEXT: mr r12, r1 +; CHECK-LE-NEXT: stdu r12, -48(r1) +; CHECK-LE-NEXT: lis r0, 4 +; CHECK-LE-NEXT: nop +; CHECK-LE-NEXT: mtctr r0 +; CHECK-LE-NEXT: .LBB6_1: # %entry +; CHECK-LE-NEXT: # +; CHECK-LE-NEXT: stdu r12, -4096(r1) +; CHECK-LE-NEXT: bdnz 
.LBB6_1 +; CHECK-LE-NEXT: # %bb.2: # %entry +; CHECK-LE-NEXT: li r3, 3 +; CHECK-LE-NEXT: stb r3, 48(r1) +; CHECK-LE-NEXT: lbz r3, 48(r1) +; CHECK-LE-NEXT: ld r1, 0(r1) +; CHECK-LE-NEXT: blr +; +; CHECK-BE-LABEL: f6: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: mr r12, r1 +; CHECK-BE-NEXT: stdu r12, -64(r1) +; CHECK-BE-NEXT: lis r0, 4 +; CHECK-BE-NEXT: nop +; CHECK-BE-NEXT: mtctr r0 +; CHECK-BE-NEXT: .LBB6_1: # %entry +; CHECK-BE-NEXT: # +; CHECK-BE-NEXT: stdu r12, -4096(r1) +; CHECK-BE-NEXT: bdnz .LBB6_1 +; CHECK-BE-NEXT: # %bb.2: # %entry +; CHECK-BE-NEXT: li r3, 3 +; CHECK-BE-NEXT: stb r3, 64(r1) +; CHECK-BE-NEXT: lbz r3, 64(r1) +; CHECK-BE-NEXT: ld r1, 0(r1) +; CHECK-BE-NEXT: blr +; +; CHECK-32-LABEL: f6: +; CHECK-32: # %bb.0: # %entry +; CHECK-32-NEXT: mr r12, r1 +; CHECK-32-NEXT: stwu r12, -16(r1) +; CHECK-32-NEXT: lis r0, 4 +; CHECK-32-NEXT: nop +; CHECK-32-NEXT: mtctr r0 +; CHECK-32-NEXT: .LBB6_1: # %entry +; CHECK-32-NEXT: # +; CHECK-32-NEXT: stwu r12, -4096(r1) +; CHECK-32-NEXT: bdnz .LBB6_1 +; CHECK-32-NEXT: # %bb.2: # %entry +; CHECK-32-NEXT: sub r0, r1, r12 +; CHECK-32-NEXT: li r3, 3 +; CHECK-32-NEXT: sub r0, r1, r0 +; CHECK-32-NEXT: stb r3, 16(r1) +; CHECK-32-NEXT: mr r0, r31 +; CHECK-32-NEXT: lbz r3, 16(r1) +; CHECK-32-NEXT: lwz r31, 0(r1) +; CHECK-32-NEXT: mr r1, r31 +; CHECK-32-NEXT: mr r31, r0 +; CHECK-32-NEXT: blr +entry: + %a = alloca i8, i64 1073741824 + %b = getelementptr inbounds i8, i8* %a, i64 63 + store volatile i8 3, i8* %a + %c = load volatile i8, i8* %a + ret i8 %c +} + +define i8 @f7() #0 "stack-probe-size"="65536" nounwind { +; CHECK-LE-LABEL: f7: +; CHECK-LE: # %bb.0: # %entry +; CHECK-LE-NEXT: lis r0, -1 +; CHECK-LE-NEXT: mr r12, r1 +; CHECK-LE-NEXT: ori r0, r0, 13776 +; CHECK-LE-NEXT: stdux r12, r1, r0 +; CHECK-LE-NEXT: li r0, 15258 +; CHECK-LE-NEXT: mtctr r0 +; CHECK-LE-NEXT: lis r0, -1 +; CHECK-LE-NEXT: nop +; CHECK-LE-NEXT: .LBB7_1: # %entry +; CHECK-LE-NEXT: # +; CHECK-LE-NEXT: stdux r12, r1, r0 +; CHECK-LE-NEXT: bdnz 
.LBB7_1 +; CHECK-LE-NEXT: # %bb.2: # %entry +; CHECK-LE-NEXT: li r3, 3 +; CHECK-LE-NEXT: stb r3, 41(r1) +; CHECK-LE-NEXT: lbz r3, 41(r1) +; CHECK-LE-NEXT: ld r1, 0(r1) +; CHECK-LE-NEXT: blr +; +; CHECK-BE-LABEL: f7: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: lis r0, -1 +; CHECK-BE-NEXT: mr r12, r1 +; CHECK-BE-NEXT: ori r0, r0, 13760 +; CHECK-BE-NEXT: stdux r12, r1, r0 +; CHECK-BE-NEXT: li r0, 15258 +; CHECK-BE-NEXT: mtctr r0 +; CHECK-BE-NEXT: lis r0, -1 +; CHECK-BE-NEXT: nop +; CHECK-BE-NEXT: .LBB7_1: # %entry +; CHECK-BE-NEXT: # +; CHECK-BE-NEXT: stdux r12, r1, r0 +; CHECK-BE-NEXT: bdnz .LBB7_1 +; CHECK-BE-NEXT: # %bb.2: # %entry +; CHECK-BE-NEXT: li r3, 3 +; CHECK-BE-NEXT: stb r3, 57(r1) +; CHECK-BE-NEXT: lbz r3, 57(r1) +; CHECK-BE-NEXT: ld r1, 0(r1) +; CHECK-BE-NEXT: blr +; +; CHECK-32-LABEL: f7: +; CHECK-32: # %bb.0: # %entry +; CHECK-32-NEXT: lis r0, -1 +; CHECK-32-NEXT: mr r12, r1 +; CHECK-32-NEXT: ori r0, r0, 13808 +; CHECK-32-NEXT: stwux r12, r1, r0 +; CHECK-32-NEXT: li r0, 15258 +; CHECK-32-NEXT: mtctr r0 +; CHECK-32-NEXT: lis r0, -1 +; CHECK-32-NEXT: nop +; CHECK-32-NEXT: .LBB7_1: # %entry +; CHECK-32-NEXT: # +; CHECK-32-NEXT: stwux r12, r1, r0 +; CHECK-32-NEXT: bdnz .LBB7_1 +; CHECK-32-NEXT: # %bb.2: # %entry +; CHECK-32-NEXT: sub r0, r1, r12 +; CHECK-32-NEXT: li r3, 3 +; CHECK-32-NEXT: sub r0, r1, r0 +; CHECK-32-NEXT: stb r3, 9(r1) +; CHECK-32-NEXT: mr r0, r31 +; CHECK-32-NEXT: lbz r3, 9(r1) +; CHECK-32-NEXT: lwz r31, 0(r1) +; CHECK-32-NEXT: mr r1, r31 +; CHECK-32-NEXT: mr r31, r0 +; CHECK-32-NEXT: blr +entry: + %a = alloca i8, i64 1000000007 + %b = getelementptr inbounds i8, i8* %a, i64 101 + store volatile i8 3, i8* %a + %c = load volatile i8, i8* %a + ret i8 %c +} + +attributes #0 = { "probe-stack"="inline-asm" }