Index: llvm/lib/Target/PowerPC/PPCFrameLowering.cpp =================================================================== --- llvm/lib/Target/PowerPC/PPCFrameLowering.cpp +++ llvm/lib/Target/PowerPC/PPCFrameLowering.cpp @@ -444,6 +444,11 @@ return RI !=MF.getRegInfo().def_end() || MFI->isLRStoreRequired(); } +static bool MustSaveTOC(const MachineFunction &MF) { + const PPCFunctionInfo *FI = MF.getInfo(); + return FI->mustSaveTOC(); +} + /// determineFrameLayoutAndUpdate - Determine the size of the frame and maximum /// call frame size. Update the MachineFunction object with the stack size. unsigned @@ -481,6 +486,7 @@ bool CanUseRedZone = !MFI.hasVarSizedObjects() && // No dynamic alloca. !MFI.adjustsStack() && // No calls. !MustSaveLR(MF, LR) && // No need to save LR. + !MustSaveTOC(MF) && // No need to save TOC. !RegInfo->hasBasePointer(MF); // No special alignment. // Note: for PPC32 SVR4ABI (Non-DarwinABI), we can still generate stackless @@ -808,6 +814,7 @@ // Check if the link register (LR) must be saved. PPCFunctionInfo *FI = MF.getInfo(); bool MustSaveLR = FI->mustSaveLR(); + bool MustSaveTOC = FI->mustSaveTOC(); const SmallVectorImpl &MustSaveCRs = FI->getMustSaveCRs(); bool MustSaveCR = !MustSaveCRs.empty(); // Do we have a frame pointer and/or base pointer for this function? @@ -819,6 +826,7 @@ unsigned BPReg = RegInfo->getBaseRegister(MF); unsigned FPReg = isPPC64 ? PPC::X31 : PPC::R31; unsigned LRReg = isPPC64 ? PPC::LR8 : PPC::LR; + unsigned TOCReg = isPPC64 ? PPC::X2 : PPC::R2; unsigned ScratchReg = 0; unsigned TempReg = isPPC64 ? PPC::X12 : PPC::R12; // another scratch reg // ...(R12/X12 is volatile in both Darwin & SVR4, & can't be a function arg.) @@ -1092,6 +1100,16 @@ HasSTUX = true; } + // Save the TOC register after the stack pointer update if a prologue TOC + // save is required for the function. 
+ if (MustSaveTOC) { + assert(isELFv2ABI && "TOC saves in the prologue only supported on ELFv2"); + BuildMI(MBB, StackUpdateLoc, dl, TII.get(PPC::STD)) + .addReg(TOCReg, getKillRegState(true)) + .addImm(TOCSaveOffset) + .addReg(SPReg); + } + if (!HasRedZone) { assert(!isPPC64 && "A red zone is always available on PPC64"); if (HasSTUX) { @@ -1293,6 +1311,9 @@ if (PPC::CRBITRCRegClass.contains(Reg)) continue; + if (Reg == PPC::X2 || Reg == PPC::R2) + continue; + // For SVR4, don't emit a move for the CR spill slot if we haven't // spilled CRs. if (isSVR4ABI && (PPC::CR2 <= Reg && Reg <= PPC::CR4) @@ -1839,11 +1860,13 @@ unsigned MinFPR = PPC::F31; unsigned MinVR = Subtarget.hasSPE() ? PPC::S31 : PPC::V31; + PPCFunctionInfo *FI = MF.getInfo(); bool HasGPSaveArea = false; bool HasG8SaveArea = false; bool HasFPSaveArea = false; bool HasVRSAVESaveArea = false; bool HasVRSaveArea = false; + bool MustSaveTOC = FI->mustSaveTOC(); SmallVector GPRegs; SmallVector G8Regs; @@ -1852,8 +1875,9 @@ for (unsigned i = 0, e = CSI.size(); i != e; ++i) { unsigned Reg = CSI[i].getReg(); - if (PPC::GPRCRegClass.contains(Reg) || - PPC::SPE4RCRegClass.contains(Reg)) { + bool IgnoreR2 = MustSaveTOC && (Reg == PPC::X2 || Reg == PPC::R2); + if (!IgnoreR2 && (PPC::GPRCRegClass.contains(Reg) || + PPC::SPE4RCRegClass.contains(Reg))) { HasGPSaveArea = true; GPRegs.push_back(CSI[i]); @@ -1861,7 +1885,7 @@ if (Reg < MinGPR) { MinGPR = Reg; } - } else if (PPC::G8RCRegClass.contains(Reg)) { + } else if (!IgnoreR2 && PPC::G8RCRegClass.contains(Reg)) { HasG8SaveArea = true; G8Regs.push_back(CSI[i]); @@ -2161,6 +2185,8 @@ MachineFunction *MF = MBB.getParent(); const PPCInstrInfo &TII = *Subtarget.getInstrInfo(); + PPCFunctionInfo *FI = MF->getInfo(); + bool MustSaveTOC = FI->mustSaveTOC(); DebugLoc DL; bool CRSpilled = false; MachineInstrBuilder CRMIB; @@ -2191,6 +2217,10 @@ continue; } + // The actual spill will happen in the prologue. 
+ if ((Reg == PPC::X2 || Reg == PPC::R2) && MustSaveTOC) + continue; + // Insert the spill to the stack frame. if (IsCRField) { PPCFunctionInfo *FuncInfo = MF->getInfo(); @@ -2318,6 +2348,8 @@ MachineFunction *MF = MBB.getParent(); const PPCInstrInfo &TII = *Subtarget.getInstrInfo(); + PPCFunctionInfo *FI = MF->getInfo(); + bool MustSaveTOC = FI->mustSaveTOC(); bool CR2Spilled = false; bool CR3Spilled = false; bool CR4Spilled = false; @@ -2340,6 +2372,9 @@ if (Reg == PPC::VRSAVE && !Subtarget.isDarwinABI()) continue; + if ((Reg == PPC::X2 || Reg == PPC::R2) && MustSaveTOC) + continue; + if (Reg == PPC::CR2) { CR2Spilled = true; // The spill slot is associated only with CR2, which is the Index: llvm/lib/Target/PowerPC/PPCMIPeephole.cpp =================================================================== --- llvm/lib/Target/PowerPC/PPCMIPeephole.cpp +++ llvm/lib/Target/PowerPC/PPCMIPeephole.cpp @@ -21,9 +21,12 @@ #include "PPC.h" #include "PPCInstrBuilder.h" #include "PPCInstrInfo.h" +#include "PPCMachineFunctionInfo.h" #include "PPCTargetMachine.h" #include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineBlockFrequencyInfo.h" #include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachinePostDominators.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" @@ -37,6 +40,7 @@ STATISTIC(RemoveTOCSave, "Number of TOC saves removed"); STATISTIC(MultiTOCSaves, "Number of functions with multiple TOC saves that must be kept"); +STATISTIC(NumTOCSavesInPrologue, "Number of TOC saves placed in the prologue"); STATISTIC(NumEliminatedSExt, "Number of eliminated sign-extensions"); STATISTIC(NumEliminatedZExt, "Number of eliminated zero-extensions"); STATISTIC(NumOptADDLIs, "Number of optimized ADD instruction fed by LI"); @@ -84,6 +88,9 @@ private: MachineDominatorTree *MDT; + MachinePostDominatorTree *MPDT; + MachineBlockFrequencyInfo *MBFI; + uint64_t EntryFreq; // Initialize 
class variables. void initialize(MachineFunction &MFParm); @@ -102,7 +109,11 @@ void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); + AU.addRequired(); + AU.addRequired(); AU.addPreserved(); + AU.addPreserved(); + AU.addPreserved(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -120,6 +131,9 @@ MF = &MFParm; MRI = &MF->getRegInfo(); MDT = &getAnalysis(); + MPDT = &getAnalysis(); + MBFI = &getAnalysis(); + EntryFreq = MBFI->getEntryFreq(); TII = MF->getSubtarget().getInstrInfo(); LLVM_DEBUG(dbgs() << "*** PowerPC MI peephole pass ***\n\n"); LLVM_DEBUG(MF->dump()); @@ -200,6 +214,32 @@ void PPCMIPeephole::UpdateTOCSaves( std::map &TOCSaves, MachineInstr *MI) { assert(TII->isTOCSaveMI(*MI) && "Expecting a TOC save instruction here"); + assert(MF->getSubtarget().isELFv2ABI() && + "TOC-save removal only supported on ELFv2"); + PPCFunctionInfo *FI = MF->getInfo(); + MachineFrameInfo &MFI = MF->getFrameInfo(); + + MachineBasicBlock *Entry = &MF->front(); + uint64_t CurrBlockFreq = MBFI->getBlockFreq(MI->getParent()).getFrequency(); + + // If the TOC save resides in a block that post-dominates Entry, or in a + // block that is hotter than Entry (keep in mind that early MachineLICM has + // already run, so the TOC save won't be hoisted), we can just do the save + // in the prologue. + if (!MFI.hasVarSizedObjects() && + (CurrBlockFreq > EntryFreq || MPDT->dominates(MI->getParent(), Entry))) + FI->setMustSaveTOC(true); + + // If we are saving the TOC in the prologue, all the TOC saves can be removed + // from the code. + if (FI->mustSaveTOC()) { + for (auto &TOCSave : TOCSaves) + TOCSave.second = false; + // Add new instruction to map. + TOCSaves[MI] = false; + return; + } + bool Keep = true; for (auto It = TOCSaves.begin(); It != TOCSaves.end(); It++ ) { MachineInstr *CurrInst = It->first; @@ -777,6 +817,10 @@ // Eliminate all the TOC save instructions which are redundant.
Simplified |= eliminateRedundantTOCSaves(TOCSaves); + PPCFunctionInfo *FI = MF->getInfo(); + if (FI->mustSaveTOC()) + NumTOCSavesInPrologue++; + // We try to eliminate redundant compare instruction. Simplified |= eliminateRedundantCompare(); @@ -1340,6 +1384,9 @@ INITIALIZE_PASS_BEGIN(PPCMIPeephole, DEBUG_TYPE, "PowerPC MI Peephole Optimization", false, false) +INITIALIZE_PASS_DEPENDENCY(MachineBlockFrequencyInfo) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) INITIALIZE_PASS_END(PPCMIPeephole, DEBUG_TYPE, "PowerPC MI Peephole Optimization", false, false) Index: llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.h =================================================================== --- llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.h +++ llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.h @@ -44,6 +44,12 @@ /// PEI. bool MustSaveLR; + /// MustSaveTOC - Indicates that the TOC save needs to be performed in the + /// prologue of the function. This is typically the case when there are + /// indirect calls in the function and it is more profitable to save the + /// TOC pointer in the prologue than in the block(s) containing the call(s). + bool MustSaveTOC = false; + /// Do we have to disable shrink-wrapping? This has to be set if we emit any /// instructions that clobber LR in the entry block because discovering this /// in PEI is too late (happens after shrink-wrapping); @@ -151,6 +157,9 @@ void setMustSaveLR(bool U) { MustSaveLR = U; } bool mustSaveLR() const { return MustSaveLR; } + void setMustSaveTOC(bool U) { MustSaveTOC = U; } + bool mustSaveTOC() const { return MustSaveTOC; } + /// We certainly don't want to shrink wrap functions if we've emitted a /// MovePCtoLR8 as that has to go into the entry, so the prologue definitely /// has to go into the entry block. 
Index: llvm/test/CodeGen/PowerPC/MCSE-caller-preserved-reg.ll =================================================================== --- llvm/test/CodeGen/PowerPC/MCSE-caller-preserved-reg.ll +++ llvm/test/CodeGen/PowerPC/MCSE-caller-preserved-reg.ll @@ -21,9 +21,9 @@ ; CHECK-NEXT: std 30, -16(1) ; CHECK-NEXT: std 0, 16(1) ; CHECK-NEXT: stdu 1, -48(1) -; CHECK-NEXT: ld 12, 0(3) -; CHECK-NEXT: mr 30, 3 ; CHECK-NEXT: std 2, 24(1) +; CHECK-NEXT: mr 30, 3 +; CHECK-NEXT: ld 12, 0(3) ; CHECK-NEXT: mtctr 12 ; CHECK-NEXT: bctrl ; CHECK-NEXT: ld 2, 24(1) Index: llvm/test/CodeGen/PowerPC/tocSaveInPrologue.ll =================================================================== --- llvm/test/CodeGen/PowerPC/tocSaveInPrologue.ll +++ llvm/test/CodeGen/PowerPC/tocSaveInPrologue.ll @@ -19,13 +19,12 @@ ; CHECK-NEXT: cmpwi cr1, r4, 11 ; CHECK-NEXT: mr r30, r3 ; CHECK-NEXT: extsw r28, r4 +; CHECK-NEXT: std r2, 24(r1) ; CHECK-NEXT: cmpwi r29, 1 ; CHECK-NEXT: cror 4*cr5+lt, lt, 4*cr1+lt -; CHECK-NEXT: bc 12, 4*cr5+lt, .LBB0_3 -; CHECK-NEXT: # %bb.1: # %for.body.us.preheader -; CHECK-NEXT: std r2, 24(r1) +; CHECK-NEXT: bc 12, 4*cr5+lt, .LBB0_2 ; CHECK-NEXT: .p2align 5 -; CHECK-NEXT: .LBB0_2: # %for.body.us +; CHECK-NEXT: .LBB0_1: # %for.body.us ; CHECK-NEXT: ; CHECK-NEXT: mtctr r30 ; CHECK-NEXT: mr r3, r28 @@ -34,12 +33,11 @@ ; CHECK-NEXT: ld 2, 24(r1) ; CHECK-NEXT: addi r29, r29, -1 ; CHECK-NEXT: cmplwi r29, 0 -; CHECK-NEXT: bne cr0, .LBB0_2 -; CHECK-NEXT: .LBB0_3: # %for.cond.cleanup +; CHECK-NEXT: bne cr0, .LBB0_1 +; CHECK-NEXT: .LBB0_2: # %for.cond.cleanup ; CHECK-NEXT: mtctr r30 ; CHECK-NEXT: mr r3, r28 ; CHECK-NEXT: mr r12, r30 -; CHECK-NEXT: std r2, 24(r1) ; CHECK-NEXT: bctrl ; CHECK-NEXT: ld 2, 24(r1) ; CHECK-NEXT: addi r1, r1, 64