Index: include/llvm/CodeGen/MachineFrameInfo.h
===================================================================
--- include/llvm/CodeGen/MachineFrameInfo.h
+++ include/llvm/CodeGen/MachineFrameInfo.h
@@ -28,9 +28,14 @@
 
 /// The CalleeSavedInfo class tracks the information need to locate where a
 /// callee saved register is in the current frame.
+/// A callee saved register is saved either to a stack slot (FrameIdx) or to
+/// another register (DstReg); isSpilledToReg() tells which one is in use.
 class CalleeSavedInfo {
   unsigned Reg;
-  int FrameIdx;
+  union {
+    int FrameIdx;
+    unsigned DstReg;
+  };
   /// Flag indicating whether the register is actually restored in the epilog.
   /// In most cases, if a register is saved, it is also restored. There are
   /// some situations, though, when this is not the case. For example, the
@@ -44,17 +49,32 @@
   /// by implicit uses on the return instructions, however, the required
   /// changes in the ARM backend would be quite extensive.
   bool Restored;
+  /// Flag indicating whether the register is spilled to another register
+  /// (DstReg is the active union member) instead of a stack slot (FrameIdx).
+  bool SpilledToReg = false;
 
 public:
   explicit CalleeSavedInfo(unsigned R, int FI = 0)
-  : Reg(R), FrameIdx(FI), Restored(true) {}
+  : Reg(R), FrameIdx(FI), Restored(true), SpilledToReg(false) {}
 
   // Accessors.
   unsigned getReg() const { return Reg; }
   int getFrameIdx() const { return FrameIdx; }
-  void setFrameIdx(int FI) { FrameIdx = FI; }
+  unsigned getDstReg() const { return DstReg; }
+  // The setters keep SpilledToReg in step with the union member they write,
+  // so the flag can never describe the inactive member.
+  void setFrameIdx(int FI) {
+    FrameIdx = FI;
+    SpilledToReg = false;
+  }
+  void setDstReg(unsigned SpillReg) {
+    DstReg = SpillReg;
+    SpilledToReg = true;
+  }
   bool isRestored() const { return Restored; }
   void setRestored(bool R) { Restored = R; }
+  bool isSpilledToReg() const { return SpilledToReg; }
+  void setSpilledToReg(bool R) { SpilledToReg = R; }
 };
 
 /// The MachineFrameInfo class represents an abstract stack frame until
@@ -247,9 +267,9 @@
   unsigned MaxCallFrameSize = ~0u;
 
   /// The prolog/epilog code inserter fills in this vector with each
-  /// callee saved register saved in the frame.  Beyond its use by the prolog/
-  /// epilog code inserter, this data used for debug info and exception
-  /// handling.
+  /// callee saved register saved in either the frame or a different
+  /// register.  Beyond its use by the prolog/epilog code inserter,
+  /// this data is used for debug info and exception handling.
   std::vector<CalleeSavedInfo> CSInfo;
 
   /// Has CSInfo been set yet?
Index: lib/CodeGen/PrologEpilogInserter.cpp
===================================================================
--- lib/CodeGen/PrologEpilogInserter.cpp
+++ lib/CodeGen/PrologEpilogInserter.cpp
@@ -76,6 +76,10 @@
 
 using MBBVector = SmallVector<MachineBasicBlock *, 4>;
 
+STATISTIC(NumLeafFuncWithSpills, "Number of leaf functions with CSRs");
+STATISTIC(NumFuncSeen, "Number of functions seen in PEI");
+
+
 namespace {
 
 class PEI : public MachineFunctionPass {
@@ -171,6 +175,7 @@
 /// runOnMachineFunction - Insert prolog/epilog code and replace abstract
 /// frame indexes with appropriate references.
 bool PEI::runOnMachineFunction(MachineFunction &Fn) {
+  ++NumFuncSeen;
   const Function &F = Fn.getFunction();
   const TargetRegisterInfo *TRI = Fn.getSubtarget().getRegisterInfo();
   const TargetFrameLowering *TFI = Fn.getSubtarget().getFrameLowering();
@@ -457,7 +462,22 @@
       if (!MRI.isReserved(Reg) && !MBB->isLiveIn(Reg))
         MBB->addLiveIn(Reg);
     }
+    // If the callee-saved register is spilled to another register rather
+    // than to the stack, the destination register has to be marked as
+    // live-in for each MBB between the prologue and epilogue so that it is
+    // not clobbered before it is reloaded in the epilogue. The Visited set
+    // contains all blocks outside the prologue and epilogue, so those are
+    // skipped.
+    if (!CSI[i].isSpilledToReg())
+      continue;
+    const MCPhysReg DstReg = CSI[i].getDstReg();
+    for (MachineBasicBlock &MBB : MF) {
+      if (Visited.count(&MBB))
+        continue;
+      if (!MBB.isLiveIn(DstReg))
+        MBB.addLiveIn(DstReg);
+    }
   }
 }
 
 /// Insert restore code for the callee-saved registers used in the function.
@@ -533,6 +553,9 @@
 
     std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
     if (!CSI.empty()) {
+      if (!MFI.hasCalls())
+        ++NumLeafFuncWithSpills;
+
       for (MachineBasicBlock *SaveBlock : SaveBlocks) {
         insertCSRSaves(*SaveBlock, CSI);
         // Update the live-in information of all the blocks up to the save
Index: lib/Target/PowerPC/PPCFrameLowering.h
===================================================================
--- lib/Target/PowerPC/PPCFrameLowering.h
+++ lib/Target/PowerPC/PPCFrameLowering.h
@@ -99,6 +99,14 @@
                                  MachineBasicBlock::iterator MI,
                                  const std::vector<CalleeSavedInfo> &CSI,
                                  const TargetRegisterInfo *TRI) const override;
+
+  /// Assign callee saved gprs to volatile vector registers for prologue
+  /// spills when applicable. A register that cannot be spilled to a volatile
+  /// vector register is assigned a stack frame index instead.
+  bool
+  assignCalleeSavedSpillSlots(MachineFunction &MF,
+                              const TargetRegisterInfo *TRI,
+                              std::vector<CalleeSavedInfo> &CSI) const override;
 
   MachineBasicBlock::iterator
   eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
Index: lib/Target/PowerPC/PPCFrameLowering.cpp
===================================================================
--- lib/Target/PowerPC/PPCFrameLowering.cpp
+++ lib/Target/PowerPC/PPCFrameLowering.cpp
@@ -17,6 +17,7 @@
 #include "PPCMachineFunctionInfo.h"
 #include "PPCSubtarget.h"
 #include "PPCTargetMachine.h"
+#include "llvm/ADT/Statistic.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -28,6 +29,22 @@
 
 using namespace llvm;
 
+#define DEBUG_TYPE "framelowering"
+STATISTIC(NumNoNeedForFrame, "Number of functions without frames");
+STATISTIC(NumPESpillVSR, "Number of spills to vector in prologue");
+STATISTIC(NumPEReloadVSR, "Number of reloads from vector in epilogue");
+
+static cl::opt<bool> EnablePEVectorSpills(
+    "ppc-enable-pe-vector-spills", cl::Hidden, cl::init(false),
+    cl::desc("Enable spills in prologue to vector registers."));
+
+/// Candidate destination registers for prologue spills of callee saved gprs.
+static const MCPhysReg VolatileVFRegNo[] = {
+  PPC::F0, PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, PPC::F6,
+  PPC::F7, PPC::F8, PPC::F9, PPC::F10, PPC::F11, PPC::F12, PPC::F13,
+  PPC::VF0, PPC::VF1, PPC::VF2, PPC::VF3, PPC::VF4, PPC::VF5, PPC::VF6,
+  PPC::VF7, PPC::VF8, PPC::VF9, PPC::VF10, PPC::VF11, PPC::VF12, PPC::VF13,
+  PPC::VF14, PPC::VF15, PPC::VF16, PPC::VF17, PPC::VF18, PPC::VF19};
 /// VRRegNo - Map from a numbered VR register to its enum value.
 ///
 static const MCPhysReg VRRegNo[] = {
@@ -446,6 +463,7 @@
 
   // Check whether we can skip adjusting the stack pointer (by using red zone)
   if (!DisableRedZone && CanUseRedZone && FitsInRedZone) {
+    ++NumNoNeedForFrame;
     // No need for frame
     if (UpdateMF)
       MFI.setStackSize(0);
@@ -1193,11 +1211,20 @@
         continue;
       }
 
-      int Offset = MFI.getObjectOffset(CSI[I].getFrameIdx());
-      unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset(
-          nullptr, MRI->getDwarfRegNum(Reg, true), Offset));
+      // Describe where the CSR was saved: another register or a stack slot.
+      unsigned CFIIndex;
+      if (CSI[I].isSpilledToReg()) {
+        unsigned SpilledReg = CSI[I].getDstReg();
+        CFIIndex = MF.addFrameInst(MCCFIInstruction::createRegister(
+            nullptr, MRI->getDwarfRegNum(Reg, true),
+            MRI->getDwarfRegNum(SpilledReg, true)));
+      } else {
+        int Offset = MFI.getObjectOffset(CSI[I].getFrameIdx());
+        CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset(
+            nullptr, MRI->getDwarfRegNum(Reg, true), Offset));
+      }
       BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
           .addCFIIndex(CFIIndex);
     }
   }
 }
@@ -1798,17 +1825,19 @@
     // Move general register save area spill slots down, taking into account
     // the size of the Floating-point register save area.
for (unsigned i = 0, e = GPRegs.size(); i != e; ++i) { - int FI = GPRegs[i].getFrameIdx(); - - MFI.setObjectOffset(FI, LowerBound + MFI.getObjectOffset(FI)); + if (!GPRegs[i].isSpilledToReg()) { + int FI = GPRegs[i].getFrameIdx(); + MFI.setObjectOffset(FI, LowerBound + MFI.getObjectOffset(FI)); + } } // Move general register save area spill slots down, taking into account // the size of the Floating-point register save area. for (unsigned i = 0, e = G8Regs.size(); i != e; ++i) { - int FI = G8Regs[i].getFrameIdx(); - - MFI.setObjectOffset(FI, LowerBound + MFI.getObjectOffset(FI)); + if (!G8Regs[i].isSpilledToReg()) { + int FI = G8Regs[i].getFrameIdx(); + MFI.setObjectOffset(FI, LowerBound + MFI.getObjectOffset(FI)); + } } unsigned MinReg = @@ -1921,6 +1950,84 @@ } } +// This function checks if a callee saved gpr can be spilled to a volatile +// vector register. This occurs for leaf functions that do not need CFI and +// when option ppc-enable-pe-vector-spills is enabled. If a free volatile +// vector register is not found, assign a FrameIdx to spill to stack. +bool PPCFrameLowering::assignCalleeSavedSpillSlots( + MachineFunction &MF, const TargetRegisterInfo *TRI, + std::vector &CSI) const { + + if (CSI.empty()) + return true; // Early exit if no callee saved registers are modified! + + // Early exit if cannot spill gprs to volatile vector registers. + MachineFrameInfo &MFI = MF.getFrameInfo(); + if (!EnablePEVectorSpills || MFI.hasCalls() || !Subtarget.hasP9Vector()) + return false; + + unsigned NumFixedSpillSlots; + const PPCFrameLowering::SpillSlot *FixedSpillSlots = + getCalleeSavedSpillSlots(NumFixedSpillSlots); + + const MachineRegisterInfo &MRI = MF.getRegInfo(); + + unsigned VolatileVFRegNoIdx = 0; + for (auto &CS : CSI) { + unsigned Reg = CS.getReg(); + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); + + // Check if this CSR can be spilled to a volatile vector register. 
+ if (PPC::G8RCRegClass.contains(Reg) || PPC::GPRCRegClass.contains(Reg)) { + for (unsigned i = VolatileVFRegNoIdx; i < 34; ++i) { + if (!MRI.isPhysRegUsed(VolatileVFRegNo[i])) { + VolatileVFRegNoIdx = i + 1; + CS.setSpilledToReg(true); + CS.setDstReg(VolatileVFRegNo[i]); + break; + } + } + } + + // If CSR not spilled to volatile vector register, assign a frame index. + if (!CS.isSpilledToReg()) { + int FrameIdx; + if (TRI->hasReservedSpillSlot(MF, Reg, FrameIdx)) { + CS.setFrameIdx(FrameIdx); + continue; + } + + // Check to see if this physreg must be spilled to a particular stack + // slot. + const PPCFrameLowering::SpillSlot *FixedSlot = FixedSpillSlots; + while (FixedSlot != FixedSpillSlots + NumFixedSpillSlots && + FixedSlot->Reg != Reg) + ++FixedSlot; + + unsigned Size = TRI->getSpillSize(*RC); + if (FixedSlot == FixedSpillSlots + NumFixedSpillSlots) { + // Nope, just spill it anywhere convenient. + unsigned Align = TRI->getSpillAlignment(*RC); + unsigned StackAlign = getStackAlignment(); + + // We may not be able to satisfy the desired alignment specification of + // the TargetRegisterClass if the stack alignment is smaller. Use the + // min. + Align = std::min(Align, StackAlign); + FrameIdx = MFI.CreateStackObject(Size, Align, true); + } else { + // Spill it to the stack where we must. + FrameIdx = MFI.CreateFixedSpillStackObject(Size, FixedSlot->Offset); + } + + CS.setFrameIdx(FrameIdx); + } + } + + return true; +} + + bool PPCFrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, @@ -1986,12 +2093,18 @@ CSI[i].getFrameIdx())); } } else { - const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); - // Use !IsLiveIn for the kill flag. - // We do not want to kill registers that are live in this function - // before their use because they will become undefined registers. 
-      TII.storeRegToStackSlot(MBB, MI, Reg, !IsLiveIn,
-                              CSI[i].getFrameIdx(), RC, TRI);
+      // Use !IsLiveIn for the kill flag.
+      // We do not want to kill registers that are live in this function
+      // before their use because they will become undefined registers.
+      if (CSI[i].isSpilledToReg()) {
+        ++NumPESpillVSR;
+        BuildMI(MBB, MI, DL, TII.get(PPC::MTVSRD), CSI[i].getDstReg())
+            .addReg(Reg, getKillRegState(!IsLiveIn));
+      } else {
+        const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+        TII.storeRegToStackSlot(MBB, MI, Reg, !IsLiveIn,
+                                CSI[i].getFrameIdx(), RC, TRI);
+      }
     }
   }
   return true;
@@ -2131,13 +2244,19 @@
         CR2Spilled = CR3Spilled = CR4Spilled = false;
       }
 
-      // Default behavior for non-CR saves.
-      const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
-      TII.loadRegFromStackSlot(MBB, I, Reg, CSI[i].getFrameIdx(),
-                               RC, TRI);
-      assert(I != MBB.begin() &&
-             "loadRegFromStackSlot didn't insert any code!");
+      if (CSI[i].isSpilledToReg()) {
+        // Reload the CSR from the register it was spilled to.
+        ++NumPEReloadVSR;
+        BuildMI(MBB, I, DebugLoc(), TII.get(PPC::MFVSRD), Reg)
+            .addReg(CSI[i].getDstReg(), getKillRegState(true));
+      } else {
+        // Default behavior for non-CR saves.
+        const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+        TII.loadRegFromStackSlot(MBB, I, Reg, CSI[i].getFrameIdx(), RC, TRI);
+        assert(I != MBB.begin() &&
+               "loadRegFromStackSlot didn't insert any code!");
       }
+    }
 
     // Insert in reverse order.
if (AtStart) Index: test/CodeGen/PowerPC/prolog_vec_spills.ll =================================================================== --- /dev/null +++ test/CodeGen/PowerPC/prolog_vec_spills.ll @@ -0,0 +1,56 @@ +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -ppc-enable-pe-vector-spills -mcpu=pwr9 < %s | FileCheck %s -check-prefix=CHECK + +define signext i32 @test1(i32 signext %a, i32 signext %b) #0 { +entry: +; CHECK-LABEL: test1 +; CHECK: mtvsrd [[REG1:[0-9]+]], 14 +; CHECK: mtvsrd [[REG2:[0-9]+]], 15 +; CHECK: mtvsrd [[REG3:[0-9]+]], 16 +; CHECK: mffprd 16, [[REG3]] +; CHECK: mffprd 15, [[REG2]] +; CHECK: mffprd 14, [[REG1]] + %a.addr = alloca i32, align 4 + %b.addr = alloca i32, align 4 + %dst = alloca i32, align 4 + store i32 %a, i32* %a.addr, align 4 + store i32 %b, i32* %b.addr, align 4 + call void asm sideeffect "", "~{v20},~{f14},~{f0}"() + %0 = load i32, i32* %a.addr, align 4 + %1 = load i32, i32* %b.addr, align 4 + %2 = call i32 asm "add $0, $1, $2", "=r,r,r,~{r14},~{r15},~{r16}"(i32 %0, i32 %1) + store i32 %2, i32* %dst, align 4 + %3 = load i32, i32* %dst, align 4 + ret i32 %3 +} + +; Test with multiple basic blocks. 
+define signext i32 @test2(i32 signext %a, i32 signext %b) { +entry: +; CHECK-LABEL: test2 +; CHECK: mtvsrd [[REG1:[0-9]+]], 14 +; CHECK: mtvsrd [[REG2:[0-9]+]], 15 +; CHECK: mtvsrd [[REG3:[0-9]+]], 16 +; CHECK: mffprd 16, [[REG3]] +; CHECK: mffprd 15, [[REG2]] +; CHECK: mffprd 14, [[REG1]] + %0 = tail call i32 asm "add $0, $1, $2", "=r,r,r,~{r14},~{r15},~{r16}"(i32 %a, i32 %b) + %cmp = icmp sgt i32 %b, %a + %cmp112 = icmp sgt i32 %b, 0 + %or.cond = and i1 %cmp, %cmp112 + br i1 %or.cond, label %for.body.lr.ph, label %if.end + +for.body.lr.ph: ; preds = %entry + %add = sub i32 %b, %a + %1 = add i32 %0, %b + %2 = add i32 %b, -1 + %3 = mul i32 %add, %2 + %4 = add i32 %1, %3 + %5 = sub i32 %4, %a + br label %if.end + +if.end: ; preds = %for.body.lr.ph, %entry + %dst.1 = phi i32 [ %0, %entry ], [ %5, %for.body.lr.ph ] + ret i32 %dst.1 +} + +attributes #0 = { nounwind }