diff --git a/llvm/lib/Target/PowerPC/PPCFrameLowering.h b/llvm/lib/Target/PowerPC/PPCFrameLowering.h --- a/llvm/lib/Target/PowerPC/PPCFrameLowering.h +++ b/llvm/lib/Target/PowerPC/PPCFrameLowering.h @@ -63,6 +63,7 @@ bool TwoUniqueRegsRequired = false, Register *SR1 = nullptr, Register *SR2 = nullptr) const; + bool findCRRegister(MachineBasicBlock *PrologMBB, Register *CRReg) const; bool twoUniqueScratchRegsRequired(MachineBasicBlock *MBB) const; /** @@ -100,6 +101,8 @@ /// the function. void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override; void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override; + void inlineStackProbe(MachineFunction &MF, + MachineBasicBlock &PrologMBB) const override; bool hasFP(const MachineFunction &MF) const override; bool needsFP(const MachineFunction &MF) const; diff --git a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp --- a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp @@ -10,6 +10,7 @@ // //===----------------------------------------------------------------------===// +#include "MCTargetDesc/PPCPredicates.h" #include "PPCFrameLowering.h" #include "PPCInstrBuilder.h" #include "PPCInstrInfo.h" @@ -672,6 +673,32 @@ return true; } +// Use RegScavenger to find an available CR register for probing stack +// allocation in prologue MBB. 
+bool PPCFrameLowering::findCRRegister(MachineBasicBlock *MBB, + Register *CRReg) const { + RegScavenger RS; + RS.enterBasicBlock(*MBB); + static const Register CallerSavedCandidate[] = { + PPC::CR0, PPC::CR1, PPC::CR5, PPC::CR6, PPC::CR7, + }; + if (!MBB->empty()) { + MachineBasicBlock::iterator MBBI = MBB->getFirstTerminator(); + if (MBBI == MBB->end()) + MBBI = std::prev(MBBI); + if (MBBI != MBB->begin()) + RS.forward(MBBI); + } + auto Candidate = std::find_if( + std::begin(CallerSavedCandidate), std::end(CallerSavedCandidate), + [&RS](Register Reg) { return !RS.isRegUsed(Reg); }); + if (Candidate != std::end(CallerSavedCandidate)) { + *CRReg = *Candidate; + return true; + } + return false; +} + // We need a scratch register for spilling LR and for spilling CR. By default, // we use two scratch registers to hide latency. However, if only one scratch // register is available, we can adjust for that by not overlapping the spill @@ -755,6 +782,7 @@ MachineFrameInfo &MFI = MF.getFrameInfo(); const PPCInstrInfo &TII = *Subtarget.getInstrInfo(); const PPCRegisterInfo *RegInfo = Subtarget.getRegisterInfo(); + const PPCTargetLowering &TLI = *Subtarget.getTargetLowering(); MachineModuleInfo &MMI = MF.getMMI(); const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo(); @@ -843,9 +871,10 @@ "FrameSize must be >0 to save/restore the FP or LR for 32-bit SVR4."); // Using the same bool variable as below to suppress compiler warnings. - bool SingleScratchReg = - findScratchRegister(&MBB, false, twoUniqueScratchRegsRequired(&MBB), - &ScratchReg, &TempReg); + bool SingleScratchReg = findScratchRegister( + &MBB, false, + twoUniqueScratchRegsRequired(&MBB) || TLI.hasInlineStackProbe(MF), + &ScratchReg, &TempReg); assert(SingleScratchReg && "Required number of registers not available in this block"); @@ -1021,58 +1050,69 @@ // the negated frame size will be placed in ScratchReg. bool HasSTUX = false; - // This condition must be kept in sync with canUseAsPrologue. 
- if (HasBP && MaxAlign > 1) { - if (isPPC64) - BuildMI(MBB, MBBI, dl, TII.get(PPC::RLDICL), ScratchReg) - .addReg(SPReg) - .addImm(0) - .addImm(64 - Log2(MaxAlign)); - else // PPC32... - BuildMI(MBB, MBBI, dl, TII.get(PPC::RLWINM), ScratchReg) - .addReg(SPReg) - .addImm(0) - .addImm(32 - Log2(MaxAlign)) - .addImm(31); - if (!isLargeFrame) { - BuildMI(MBB, MBBI, dl, SubtractImmCarryingInst, ScratchReg) - .addReg(ScratchReg, RegState::Kill) + if (TLI.hasInlineStackProbe(MF) && FrameSize > TLI.getStackProbeSize(MF)) { + BuildMI(MBB, MBBI, dl, + TII.get(isPPC64 ? PPC::PROBED_STACKALLOC_64 + : PPC::PROBED_STACKALLOC_32)) + .addDef(ScratchReg) + .addDef(TempReg) .addImm(NegFrameSize); - } else { - assert(!SingleScratchReg && "Only a single scratch reg available"); - BuildMI(MBB, MBBI, dl, LoadImmShiftedInst, TempReg) - .addImm(NegFrameSize >> 16); - BuildMI(MBB, MBBI, dl, OrImmInst, TempReg) - .addReg(TempReg, RegState::Kill) - .addImm(NegFrameSize & 0xFFFF); - BuildMI(MBB, MBBI, dl, SubtractCarryingInst, ScratchReg) - .addReg(ScratchReg, RegState::Kill) - .addReg(TempReg, RegState::Kill); - } + // Update HasSTUX, keep sync with inlineStackProbe. + HasSTUX = HasBP && MaxAlign > 1; + } else { + // This condition must be kept in sync with canUseAsPrologue. + if (HasBP && MaxAlign > 1) { + if (isPPC64) + BuildMI(MBB, MBBI, dl, TII.get(PPC::RLDICL), ScratchReg) + .addReg(SPReg) + .addImm(0) + .addImm(64 - Log2(MaxAlign)); + else // PPC32... 
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::RLWINM), ScratchReg) + .addReg(SPReg) + .addImm(0) + .addImm(32 - Log2(MaxAlign)) + .addImm(31); + if (!isLargeFrame) { + BuildMI(MBB, MBBI, dl, SubtractImmCarryingInst, ScratchReg) + .addReg(ScratchReg, RegState::Kill) + .addImm(NegFrameSize); + } else { + assert(!SingleScratchReg && "Only a single scratch reg available"); + BuildMI(MBB, MBBI, dl, LoadImmShiftedInst, TempReg) + .addImm(NegFrameSize >> 16); + BuildMI(MBB, MBBI, dl, OrImmInst, TempReg) + .addReg(TempReg, RegState::Kill) + .addImm(NegFrameSize & 0xFFFF); + BuildMI(MBB, MBBI, dl, SubtractCarryingInst, ScratchReg) + .addReg(ScratchReg, RegState::Kill) + .addReg(TempReg, RegState::Kill); + } - BuildMI(MBB, MBBI, dl, StoreUpdtIdxInst, SPReg) - .addReg(SPReg, RegState::Kill) - .addReg(SPReg) - .addReg(ScratchReg); - HasSTUX = true; + BuildMI(MBB, MBBI, dl, StoreUpdtIdxInst, SPReg) + .addReg(SPReg, RegState::Kill) + .addReg(SPReg) + .addReg(ScratchReg); + HasSTUX = true; - } else if (!isLargeFrame) { - BuildMI(MBB, StackUpdateLoc, dl, StoreUpdtInst, SPReg) - .addReg(SPReg) - .addImm(NegFrameSize) - .addReg(SPReg); + } else if (!isLargeFrame) { + BuildMI(MBB, StackUpdateLoc, dl, StoreUpdtInst, SPReg) + .addReg(SPReg) + .addImm(NegFrameSize) + .addReg(SPReg); - } else { - BuildMI(MBB, MBBI, dl, LoadImmShiftedInst, ScratchReg) - .addImm(NegFrameSize >> 16); - BuildMI(MBB, MBBI, dl, OrImmInst, ScratchReg) - .addReg(ScratchReg, RegState::Kill) - .addImm(NegFrameSize & 0xFFFF); - BuildMI(MBB, MBBI, dl, StoreUpdtIdxInst, SPReg) - .addReg(SPReg, RegState::Kill) - .addReg(SPReg) - .addReg(ScratchReg); - HasSTUX = true; + } else { + BuildMI(MBB, MBBI, dl, LoadImmShiftedInst, ScratchReg) + .addImm(NegFrameSize >> 16); + BuildMI(MBB, MBBI, dl, OrImmInst, ScratchReg) + .addReg(ScratchReg, RegState::Kill) + .addImm(NegFrameSize & 0xFFFF); + BuildMI(MBB, MBBI, dl, StoreUpdtIdxInst, SPReg) + .addReg(SPReg, RegState::Kill) + .addReg(SPReg) + .addReg(ScratchReg); + HasSTUX = true; + } } 
// Save the TOC register after the stack pointer update if a prologue TOC @@ -1333,6 +1373,131 @@ } } +void PPCFrameLowering::inlineStackProbe(MachineFunction &MF, + MachineBasicBlock &PrologMBB) const { + // TODO: Generate CFI instructions. + bool isPPC64 = Subtarget.isPPC64(); + const PPCTargetLowering &TLI = *Subtarget.getTargetLowering(); + const PPCInstrInfo &TII = *Subtarget.getInstrInfo(); + MachineFrameInfo &MFI = MF.getFrameInfo(); + auto Where = llvm::find_if(PrologMBB, [](MachineInstr &MI) { + int Opc = MI.getOpcode(); + return Opc == PPC::PROBED_STACKALLOC_64 || Opc == PPC::PROBED_STACKALLOC_32; + }); + if (Where == PrologMBB.end()) + return; + const BasicBlock *LLVM_BB = PrologMBB.getBasicBlock(); + DebugLoc DL = PrologMBB.findDebugLoc(Where); + MachineInstr &MI = *Where; + int64_t NegFrameSize = MI.getOperand(2).getImm(); + int64_t NegProbeSize = -(int64_t)TLI.getStackProbeSize(MF); + assert(NegProbeSize % 4 == 0 && + "PPC's D-form requires imm is a multiple of 4"); + int64_t NumBlocks = NegFrameSize / NegProbeSize; + int64_t NegResidualSize = NegFrameSize % NegProbeSize; + Register SPReg = isPPC64 ? PPC::X1 : PPC::R1; + Register ScratchReg = MI.getOperand(0).getReg(); + Register FPReg = MI.getOperand(1).getReg(); + const PPCRegisterInfo *RegInfo = Subtarget.getRegisterInfo(); + bool HasBP = RegInfo->hasBasePointer(MF); + bool isLargeFrame = !isInt<16>(NegFrameSize); + Align MaxAlign = MFI.getMaxAlign(); + // Initialize current frame pointer. + const MCInstrDesc &CopyInst = TII.get(isPPC64 ? PPC::OR8 : PPC::OR); + BuildMI(PrologMBB, {MI}, DL, CopyInst, FPReg).addReg(SPReg).addReg(SPReg); + auto allocateAndProbe = [&](MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + int64_t NegSize) { + BuildMI(MBB, MBBI, DL, TII.get(isPPC64 ? PPC::STDU : PPC::STWU), SPReg) + .addReg(FPReg) + .addImm(NegSize) + .addReg(SPReg); + }; + // For case HasBP && MaxAlign > 1, SP = SP + (NegFrameSize - SP % MaxAlign). We + // can split into two parts, + // 1.
SP = SP - SP % MaxAlign + // 2. SP = SP + NegFrameSize + if (HasBP && MaxAlign > 1) { + if (isPPC64) + BuildMI(PrologMBB, {MI}, DL, TII.get(PPC::RLDICL), ScratchReg) + .addReg(FPReg) + .addImm(0) + .addImm(64 - Log2(MaxAlign)); + else + BuildMI(PrologMBB, {MI}, DL, TII.get(PPC::RLWINM), ScratchReg) + .addReg(FPReg) + .addImm(0) + .addImm(32 - Log2(MaxAlign)) + .addImm(31); + BuildMI(PrologMBB, {MI}, DL, TII.get(isPPC64 ? PPC::STDUX : PPC::STWUX), + SPReg) + .addReg(FPReg) + .addReg(SPReg) + .addReg(ScratchReg); + } + // If number of blocks is small, just probe them directly. + if (NumBlocks < 3) { + if (NegResidualSize) + allocateAndProbe(PrologMBB, {MI}, NegResidualSize); + for (int i = 0; i < NumBlocks; ++i) + allocateAndProbe(PrologMBB, {MI}, NegProbeSize); + } else { + // Synthesize a loop to probe. + Register CRReg; + bool FoundCRReg = findCRRegister(&PrologMBB, &CRReg); + assert(FoundCRReg && "Can't find available cr register"); + (void)FoundCRReg; + // Calculate final SPReg and store in ScratchReg. + if (!isLargeFrame) + BuildMI(PrologMBB, {MI}, DL, TII.get(isPPC64 ? PPC::ADDI8 : PPC::ADDI), + ScratchReg) + .addReg(SPReg) + .addImm(NegFrameSize); + else { + BuildMI(PrologMBB, {MI}, DL, TII.get(isPPC64 ? PPC::LIS8 : PPC::LIS), + ScratchReg) + .addImm(NegFrameSize >> 16); + BuildMI(PrologMBB, {MI}, DL, TII.get(isPPC64 ? PPC::ORI8 : PPC::ORI), + ScratchReg) + .addReg(ScratchReg, RegState::Kill) + .addImm(NegFrameSize & 0xFFFF); + BuildMI(PrologMBB, {MI}, DL, TII.get(isPPC64 ? PPC::ADD8 : PPC::ADD4), + ScratchReg) + .addReg(SPReg) + .addReg(ScratchReg, RegState::Kill); + } + // Probe residual part. + if (NegResidualSize) + allocateAndProbe(PrologMBB, {MI}, NegResidualSize); + // Synthesize a loop for probing.
+ MachineFunction::iterator MBBInsertPoint = std::next(PrologMBB.getIterator()); + MachineBasicBlock *LoopMBB = MF.CreateMachineBasicBlock(LLVM_BB); + MF.insert(MBBInsertPoint, LoopMBB); + MachineBasicBlock *ExitMBB = MF.CreateMachineBasicBlock(LLVM_BB); + MF.insert(MBBInsertPoint, ExitMBB); + // Synthesize loop body. + LoopMBB->addLiveIn(SPReg); + LoopMBB->addLiveIn(ScratchReg); + LoopMBB->addLiveIn(FPReg); + allocateAndProbe(*LoopMBB, LoopMBB->end(), NegProbeSize); + BuildMI(LoopMBB, DL, TII.get(isPPC64 ? PPC::CMPD : PPC::CMPW), CRReg) + .addReg(SPReg) + .addReg(ScratchReg); + BuildMI(LoopMBB, DL, TII.get(PPC::BCC)) + .addImm(PPC::PRED_NE) + .addReg(CRReg, RegState::Kill) + .addMBB(LoopMBB); + LoopMBB->addSuccessor(ExitMBB); + LoopMBB->addSuccessor(LoopMBB); + // Synthesize exit MBB. + ExitMBB->splice(ExitMBB->end(), &PrologMBB, + std::next(MachineBasicBlock::iterator(MI)), PrologMBB.end()); + ExitMBB->transferSuccessorsAndUpdatePHIs(&PrologMBB); + PrologMBB.addSuccessor(LoopMBB); + } + MI.eraseFromParent(); +} + void PPCFrameLowering::emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const { MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator(); diff --git a/llvm/lib/Target/PowerPC/PPCInstr64Bit.td b/llvm/lib/Target/PowerPC/PPCInstr64Bit.td --- a/llvm/lib/Target/PowerPC/PPCInstr64Bit.td +++ b/llvm/lib/Target/PowerPC/PPCInstr64Bit.td @@ -434,6 +434,9 @@ def PREPARE_PROBED_ALLOCA_64 : PPCEmitTimePseudo<(outs g8rc:$fp, g8rc:$sp), (ins g8rc:$negsize, memri:$fpsi), "#PREPARE_PROBED_ALLOCA_64", []>; +def PROBED_STACKALLOC_64 : PPCEmitTimePseudo<(outs g8rc:$scratch, g8rc:$temp), + (ins i64imm:$stacksize), + "#PROBED_STACKALLOC_64", []>; } let hasSideEffects = 0 in { diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td @@ -1384,6 +1384,9 @@ def PREPARE_PROBED_ALLOCA_32 : PPCEmitTimePseudo<(outs gprc:$fp, gprc:$sp), (ins 
gprc:$negsize, memri:$fpsi), "#PREPARE_PROBED_ALLOCA_32", []>; +def PROBED_STACKALLOC_32 : PPCEmitTimePseudo<(outs gprc:$scratch, gprc:$temp), + (ins i64imm:$stacksize), + "#PROBED_STACKALLOC_32", []>; } // SELECT_CC_* - Used to implement the SELECT_CC DAG operation. Expanded after diff --git a/llvm/test/CodeGen/PowerPC/stack-clash-prologue.ll b/llvm/test/CodeGen/PowerPC/stack-clash-prologue.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/stack-clash-prologue.ll @@ -0,0 +1,246 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -ppc-asm-full-reg-names -verify-machineinstrs \ +; RUN: -mtriple=powerpc64le-linux-gnu < %s | FileCheck \ +; RUN: -check-prefix=CHECK-LE %s +; RUN: llc -ppc-asm-full-reg-names -verify-machineinstrs \ +; RUN: -mtriple=powerpc64-linux-gnu < %s | FileCheck \ +; RUN: -check-prefix=CHECK-BE %s +; RUN: llc -ppc-asm-full-reg-names -verify-machineinstrs \ +; RUN: -mtriple=powerpc-linux-gnu < %s | FileCheck \ +; RUN: -check-prefix=CHECK-32 %s + +; Free probe +define i8 @f0() #0 nounwind { +; CHECK-LE-LABEL: f0: +; CHECK-LE: # %bb.0: # %entry +; CHECK-LE-NEXT: li r3, 3 +; CHECK-LE-NEXT: stb r3, -64(r1) +; CHECK-LE-NEXT: lbz r3, -64(r1) +; CHECK-LE-NEXT: blr +; +; CHECK-BE-LABEL: f0: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: li r3, 3 +; CHECK-BE-NEXT: stb r3, -64(r1) +; CHECK-BE-NEXT: lbz r3, -64(r1) +; CHECK-BE-NEXT: blr +; +; CHECK-32-LABEL: f0: +; CHECK-32: # %bb.0: # %entry +; CHECK-32-NEXT: stwu r1, -80(r1) +; CHECK-32-NEXT: li r3, 3 +; CHECK-32-NEXT: stb r3, 16(r1) +; CHECK-32-NEXT: lbz r3, 16(r1) +; CHECK-32-NEXT: addi r1, r1, 80 +; CHECK-32-NEXT: blr +entry: + %a = alloca i8, i64 64 + %b = getelementptr inbounds i8, i8* %a, i64 63 + store volatile i8 3, i8* %a + %c = load volatile i8, i8* %a + ret i8 %c +} + +define i8 @f1() #0 nounwind { +; CHECK-LE-LABEL: f1: +; CHECK-LE: # %bb.0: # %entry +; CHECK-LE-NEXT: mr r12, r1 +; CHECK-LE-NEXT: stdu r12, -48(r1) +; CHECK-LE-NEXT: 
stdu r12, -4096(r1) +; CHECK-LE-NEXT: li r3, 3 +; CHECK-LE-NEXT: stb r3, 48(r1) +; CHECK-LE-NEXT: lbz r3, 48(r1) +; CHECK-LE-NEXT: addi r1, r1, 4144 +; CHECK-LE-NEXT: blr +; +; CHECK-BE-LABEL: f1: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: mr r12, r1 +; CHECK-BE-NEXT: stdu r12, -64(r1) +; CHECK-BE-NEXT: stdu r12, -4096(r1) +; CHECK-BE-NEXT: li r3, 3 +; CHECK-BE-NEXT: stb r3, 64(r1) +; CHECK-BE-NEXT: lbz r3, 64(r1) +; CHECK-BE-NEXT: addi r1, r1, 4160 +; CHECK-BE-NEXT: blr +; +; CHECK-32-LABEL: f1: +; CHECK-32: # %bb.0: # %entry +; CHECK-32-NEXT: mr r12, r1 +; CHECK-32-NEXT: stwu r12, -16(r1) +; CHECK-32-NEXT: stwu r12, -4096(r1) +; CHECK-32-NEXT: li r3, 3 +; CHECK-32-NEXT: stb r3, 16(r1) +; CHECK-32-NEXT: lbz r3, 16(r1) +; CHECK-32-NEXT: addi r1, r1, 4112 +; CHECK-32-NEXT: blr +entry: + %a = alloca i8, i64 4096 + %b = getelementptr inbounds i8, i8* %a, i64 63 + store volatile i8 3, i8* %a + %c = load volatile i8, i8* %a + ret i8 %c +} + +define i8 @f2() #0 nounwind { +; CHECK-LE-LABEL: f2: +; CHECK-LE: # %bb.0: # %entry +; CHECK-LE-NEXT: lis r0, -2 +; CHECK-LE-NEXT: mr r12, r1 +; CHECK-LE-NEXT: ori r0, r0, 65488 +; CHECK-LE-NEXT: add r0, r1, r0 +; CHECK-LE-NEXT: stdu r12, -48(r1) +; CHECK-LE-NEXT: .LBB2_1: # %entry +; CHECK-LE-NEXT: # +; CHECK-LE-NEXT: stdu r12, -4096(r1) +; CHECK-LE-NEXT: cmpd r1, r0 +; CHECK-LE-NEXT: bne cr0, .LBB2_1 +; CHECK-LE-NEXT: # %bb.2: # %entry +; CHECK-LE-NEXT: li r3, 3 +; CHECK-LE-NEXT: stb r3, 48(r1) +; CHECK-LE-NEXT: lbz r3, 48(r1) +; CHECK-LE-NEXT: ld r1, 0(r1) +; CHECK-LE-NEXT: blr +; +; CHECK-BE-LABEL: f2: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: lis r0, -2 +; CHECK-BE-NEXT: ori r0, r0, 65472 +; CHECK-BE-NEXT: mr r12, r1 +; CHECK-BE-NEXT: add r0, r1, r0 +; CHECK-BE-NEXT: stdu r12, -64(r1) +; CHECK-BE-NEXT: .LBB2_1: # %entry +; CHECK-BE-NEXT: # +; CHECK-BE-NEXT: stdu r12, -4096(r1) +; CHECK-BE-NEXT: cmpd r1, r0 +; CHECK-BE-NEXT: bne cr0, .LBB2_1 +; CHECK-BE-NEXT: # %bb.2: # %entry +; CHECK-BE-NEXT: li r3, 3 +; 
CHECK-BE-NEXT: stb r3, 64(r1) +; CHECK-BE-NEXT: lbz r3, 64(r1) +; CHECK-BE-NEXT: ld r1, 0(r1) +; CHECK-BE-NEXT: blr +; +; CHECK-32-LABEL: f2: +; CHECK-32: # %bb.0: # %entry +; CHECK-32-NEXT: lis r0, -2 +; CHECK-32-NEXT: ori r0, r0, 65520 +; CHECK-32-NEXT: mr r12, r1 +; CHECK-32-NEXT: add r0, r1, r0 +; CHECK-32-NEXT: stwu r12, -16(r1) +; CHECK-32-NEXT: .LBB2_1: # %entry +; CHECK-32-NEXT: # +; CHECK-32-NEXT: stwu r12, -4096(r1) +; CHECK-32-NEXT: cmpw r1, r0 +; CHECK-32-NEXT: bne cr0, .LBB2_1 +; CHECK-32-NEXT: # %bb.2: # %entry +; CHECK-32-NEXT: li r3, 3 +; CHECK-32-NEXT: stb r3, 16(r1) +; CHECK-32-NEXT: mr r0, r31 +; CHECK-32-NEXT: lbz r3, 16(r1) +; CHECK-32-NEXT: lwz r31, 0(r1) +; CHECK-32-NEXT: mr r1, r31 +; CHECK-32-NEXT: mr r31, r0 +; CHECK-32-NEXT: blr +entry: + %a = alloca i8, i64 65536 + %b = getelementptr inbounds i8, i8* %a, i64 63 + store volatile i8 3, i8* %a + %c = load volatile i8, i8* %a + ret i8 %c +} + +define i8 @f3() #0 "stack-probe-size"="32768" nounwind { +; CHECK-LE-LABEL: f3: +; CHECK-LE: # %bb.0: # %entry +; CHECK-LE-NEXT: mr r12, r1 +; CHECK-LE-NEXT: stdu r12, -48(r1) +; CHECK-LE-NEXT: stdu r12, -32768(r1) +; CHECK-LE-NEXT: stdu r12, -32768(r1) +; CHECK-LE-NEXT: li r3, 3 +; CHECK-LE-NEXT: stb r3, 48(r1) +; CHECK-LE-NEXT: lbz r3, 48(r1) +; CHECK-LE-NEXT: ld r1, 0(r1) +; CHECK-LE-NEXT: blr +; +; CHECK-BE-LABEL: f3: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: mr r12, r1 +; CHECK-BE-NEXT: stdu r12, -64(r1) +; CHECK-BE-NEXT: stdu r12, -32768(r1) +; CHECK-BE-NEXT: stdu r12, -32768(r1) +; CHECK-BE-NEXT: li r3, 3 +; CHECK-BE-NEXT: stb r3, 64(r1) +; CHECK-BE-NEXT: lbz r3, 64(r1) +; CHECK-BE-NEXT: ld r1, 0(r1) +; CHECK-BE-NEXT: blr +; +; CHECK-32-LABEL: f3: +; CHECK-32: # %bb.0: # %entry +; CHECK-32-NEXT: mr r12, r1 +; CHECK-32-NEXT: stwu r12, -16(r1) +; CHECK-32-NEXT: stwu r12, -32768(r1) +; CHECK-32-NEXT: stwu r12, -32768(r1) +; CHECK-32-NEXT: li r3, 3 +; CHECK-32-NEXT: stb r3, 16(r1) +; CHECK-32-NEXT: mr r0, r31 +; CHECK-32-NEXT: lbz r3, 16(r1) 
+; CHECK-32-NEXT: lwz r31, 0(r1) +; CHECK-32-NEXT: mr r1, r31 +; CHECK-32-NEXT: mr r31, r0 +; CHECK-32-NEXT: blr +entry: + %a = alloca i8, i64 65536 + %b = getelementptr inbounds i8, i8* %a, i64 63 + store volatile i8 3, i8* %a + %c = load volatile i8, i8* %a + ret i8 %c +} + +; Same as f2, but without protection. +define i8 @f4() nounwind { +; CHECK-LE-LABEL: f4: +; CHECK-LE: # %bb.0: # %entry +; CHECK-LE-NEXT: lis r0, -2 +; CHECK-LE-NEXT: ori r0, r0, 65488 +; CHECK-LE-NEXT: stdux r1, r1, r0 +; CHECK-LE-NEXT: li r3, 3 +; CHECK-LE-NEXT: stb r3, 48(r1) +; CHECK-LE-NEXT: lbz r3, 48(r1) +; CHECK-LE-NEXT: ld r1, 0(r1) +; CHECK-LE-NEXT: blr +; +; CHECK-BE-LABEL: f4: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: lis r0, -2 +; CHECK-BE-NEXT: ori r0, r0, 65472 +; CHECK-BE-NEXT: stdux r1, r1, r0 +; CHECK-BE-NEXT: li r3, 3 +; CHECK-BE-NEXT: stb r3, 64(r1) +; CHECK-BE-NEXT: lbz r3, 64(r1) +; CHECK-BE-NEXT: ld r1, 0(r1) +; CHECK-BE-NEXT: blr +; +; CHECK-32-LABEL: f4: +; CHECK-32: # %bb.0: # %entry +; CHECK-32-NEXT: lis r0, -2 +; CHECK-32-NEXT: ori r0, r0, 65520 +; CHECK-32-NEXT: stwux r1, r1, r0 +; CHECK-32-NEXT: li r3, 3 +; CHECK-32-NEXT: sub r0, r1, r0 +; CHECK-32-NEXT: stb r3, 16(r1) +; CHECK-32-NEXT: mr r0, r31 +; CHECK-32-NEXT: lbz r3, 16(r1) +; CHECK-32-NEXT: lwz r31, 0(r1) +; CHECK-32-NEXT: mr r1, r31 +; CHECK-32-NEXT: mr r31, r0 +; CHECK-32-NEXT: blr +entry: + %a = alloca i8, i64 65536 + %b = getelementptr inbounds i8, i8* %a, i64 63 + store volatile i8 3, i8* %a + %c = load volatile i8, i8* %a + ret i8 %c +} + +attributes #0 = { "probe-stack"="inline-asm" }