Index: include/llvm/CodeGen/MachineFrameInfo.h =================================================================== --- include/llvm/CodeGen/MachineFrameInfo.h +++ include/llvm/CodeGen/MachineFrameInfo.h @@ -28,9 +28,14 @@ /// The CalleeSavedInfo class tracks the information need to locate where a /// callee saved register is in the current frame. +/// Callee saved reg can also be saved to a different register rather than +/// on the stack by setting DstReg instead of FrameIdx. class CalleeSavedInfo { unsigned Reg; - int FrameIdx; + union { + int FrameIdx; + unsigned DstReg; + }; /// Flag indicating whether the register is actually restored in the epilog. /// In most cases, if a register is saved, it is also restored. There are /// some situations, though, when this is not the case. For example, the @@ -44,17 +49,29 @@ /// by implicit uses on the return instructions, however, the required /// changes in the ARM backend would be quite extensive. bool Restored; + /// Flag indicating whether the register is spilled to stack or another + /// register. + bool SpilledToReg = false; public: explicit CalleeSavedInfo(unsigned R, int FI = 0) - : Reg(R), FrameIdx(FI), Restored(true) {} + : Reg(R), FrameIdx(FI), Restored(true), SpilledToReg(false) {} // Accessors. unsigned getReg() const { return Reg; } int getFrameIdx() const { return FrameIdx; } - void setFrameIdx(int FI) { FrameIdx = FI; } + unsigned getDstReg() const { return DstReg; } + void setFrameIdx(int FI) { + FrameIdx = FI; + SpilledToReg = false; + } + void setDstReg(unsigned SpillReg) { + DstReg = SpillReg; + SpilledToReg = true; + } bool isRestored() const { return Restored; } void setRestored(bool R) { Restored = R; } + bool isSpilledToReg() const { return SpilledToReg; } }; /// The MachineFrameInfo class represents an abstract stack frame until @@ -250,9 +267,9 @@ unsigned MaxCallFrameSize = ~0u; /// The prolog/epilog code inserter fills in this vector with each - /// callee saved register saved in the frame. Beyond its use by the prolog/ - /// epilog code inserter, this data used for debug info and exception - /// handling. + /// callee saved register saved in either the frame or a different + /// register. Beyond its use by the prolog/ epilog code inserter, + /// this data is used for debug info and exception handling. std::vector CSInfo; /// Has CSInfo been set yet? Index: lib/CodeGen/PrologEpilogInserter.cpp =================================================================== --- lib/CodeGen/PrologEpilogInserter.cpp +++ lib/CodeGen/PrologEpilogInserter.cpp @@ -76,6 +76,10 @@ using MBBVector = SmallVector; +STATISTIC(NumLeafFuncWithSpills, "Number of leaf functions with CSRs"); +STATISTIC(NumFuncSeen, "Number of functions seen in PEI"); + + namespace { class PEI : public MachineFunctionPass { @@ -171,6 +175,7 @@ /// runOnMachineFunction - Insert prolog/epilog code and replace abstract /// frame indexes with appropriate references. bool PEI::runOnMachineFunction(MachineFunction &MF) { + NumFuncSeen++; const Function &F = MF.getFunction(); const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); @@ -360,6 +365,11 @@ // Now that we know which registers need to be saved and restored, allocate // stack slots for them. for (auto &CS : CSI) { + // If the target has spilled this register to another register, we don't + // need to allocate a stack slot. + if (CS.isSpilledToReg()) + continue; + unsigned Reg = CS.getReg(); const TargetRegisterClass *RC = RegInfo->getMinimalPhysRegClass(Reg); @@ -457,7 +467,22 @@ if (!MRI.isReserved(Reg) && !MBB->isLiveIn(Reg)) MBB->addLiveIn(Reg); } + // If callee-saved register is spilled to another register rather than + // spilling to stack, the destination register has to be marked as live for + // each MBB between the prologue and epilogue so that it is not clobbered + // before it is reloaded in the epilogue. The Visited set contains all + // blocks outside the prologue and epilogue. + if (CSI[i].isSpilledToReg()) { + for (MachineBasicBlock &MBB : MF) { + if (Visited.count(&MBB)) + continue; + MCPhysReg DstReg = CSI[i].getDstReg(); + if (!MBB.isLiveIn(DstReg)) + MBB.addLiveIn(DstReg); + } + } } + } /// Insert restore code for the callee-saved registers used in the function. @@ -533,6 +558,9 @@ std::vector &CSI = MFI.getCalleeSavedInfo(); if (!CSI.empty()) { + if (!MFI.hasCalls()) + NumLeafFuncWithSpills++; + for (MachineBasicBlock *SaveBlock : SaveBlocks) { insertCSRSaves(*SaveBlock, CSI); // Update the live-in information of all the blocks up to the save Index: lib/Target/PowerPC/PPCFrameLowering.h =================================================================== --- lib/Target/PowerPC/PPCFrameLowering.h +++ lib/Target/PowerPC/PPCFrameLowering.h @@ -99,6 +99,13 @@ MachineBasicBlock::iterator MI, const std::vector &CSI, const TargetRegisterInfo *TRI) const override; + /// This function will assign callee saved gprs to volatile vector registers + /// for prologue spills when applicable. It returns false if there are any + /// registers which were not spilled to volatile vector registers. + bool + assignCalleeSavedSpillSlots(MachineFunction &MF, + const TargetRegisterInfo *TRI, + std::vector &CSI) const override; MachineBasicBlock::iterator eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, Index: lib/Target/PowerPC/PPCFrameLowering.cpp =================================================================== --- lib/Target/PowerPC/PPCFrameLowering.cpp +++ lib/Target/PowerPC/PPCFrameLowering.cpp @@ -17,6 +17,7 @@ #include "PPCMachineFunctionInfo.h" #include "PPCSubtarget.h" #include "PPCTargetMachine.h" +#include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -28,6 +29,16 @@ using namespace llvm; +#define DEBUG_TYPE "framelowering" +STATISTIC(NumNoNeedForFrame, "Number of functions without frames"); +STATISTIC(NumPESpillVSR, "Number of spills to vector in prologue"); +STATISTIC(NumPEReloadVSR, "Number of reloads from vector in epilogue"); + +static cl::opt +EnablePEVectorSpills("ppc-enable-pe-vector-spills", + cl::desc("Enable spills in prologue to vector registers."), + cl::init(true), cl::Hidden); + /// VRRegNo - Map from a numbered VR register to its enum value. /// static const MCPhysReg VRRegNo[] = { @@ -446,6 +457,7 @@ // Check whether we can skip adjusting the stack pointer (by using red zone) if (!DisableRedZone && CanUseRedZone && FitsInRedZone) { + NumNoNeedForFrame++; // No need for frame if (UpdateMF) MFI.setStackSize(0); @@ -1193,11 +1205,20 @@ continue; } - int Offset = MFI.getObjectOffset(CSI[I].getFrameIdx()); - unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset( - nullptr, MRI->getDwarfRegNum(Reg, true), Offset)); - BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex); + if (CSI[I].isSpilledToReg()) { + unsigned SpilledReg = CSI[I].getDstReg(); + unsigned CFIRegister = MF.addFrameInst(MCCFIInstruction::createRegister( + nullptr, MRI->getDwarfRegNum(Reg, true), + MRI->getDwarfRegNum(SpilledReg, true))); + BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIRegister); + } else { + int Offset = MFI.getObjectOffset(CSI[I].getFrameIdx()); + unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset( + nullptr, MRI->getDwarfRegNum(Reg, true), Offset)); + BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); + } } } } @@ -1798,17 +1819,19 @@ // Move general register save area spill slots down, taking into account // the size of the Floating-point register save area. for (unsigned i = 0, e = GPRegs.size(); i != e; ++i) { - int FI = GPRegs[i].getFrameIdx(); - - MFI.setObjectOffset(FI, LowerBound + MFI.getObjectOffset(FI)); + if (!GPRegs[i].isSpilledToReg()) { + int FI = GPRegs[i].getFrameIdx(); + MFI.setObjectOffset(FI, LowerBound + MFI.getObjectOffset(FI)); + } } // Move general register save area spill slots down, taking into account // the size of the Floating-point register save area. for (unsigned i = 0, e = G8Regs.size(); i != e; ++i) { - int FI = G8Regs[i].getFrameIdx(); - - MFI.setObjectOffset(FI, LowerBound + MFI.getObjectOffset(FI)); + if (!G8Regs[i].isSpilledToReg()) { + int FI = G8Regs[i].getFrameIdx(); + MFI.setObjectOffset(FI, LowerBound + MFI.getObjectOffset(FI)); + } } unsigned MinReg = @@ -1921,6 +1944,65 @@ } } +// This function checks if a callee saved gpr can be spilled to a volatile +// vector register. This occurs for leaf functions when the option +// ppc-enable-pe-vector-spills is enabled. If there are any remaining registers +// which were not spilled to vectors, return false so the target independent +// code can handle them by assigning a FrameIdx to a stack slot. +bool PPCFrameLowering::assignCalleeSavedSpillSlots( + MachineFunction &MF, const TargetRegisterInfo *TRI, + std::vector &CSI) const { + + if (CSI.empty()) + return true; // Early exit if no callee saved registers are modified! + + // Early exit if cannot spill gprs to volatile vector registers. + MachineFrameInfo &MFI = MF.getFrameInfo(); + if (!EnablePEVectorSpills || MFI.hasCalls() || !Subtarget.hasP9Vector()) + return false; + + // Build a BitVector of VSRs that can be used for spilling GPRs. + BitVector BVAllocatable = TRI->getAllocatableSet(MF); + BitVector BVCalleeSaved(TRI->getNumRegs()); + const PPCRegisterInfo *RegInfo = Subtarget.getRegisterInfo(); + const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF); + for (unsigned i = 0; CSRegs[i]; ++i) + BVCalleeSaved.set(CSRegs[i]); + + for (unsigned Reg = BVAllocatable.find_first(); Reg < BVAllocatable.size(); + Reg = BVAllocatable.find_next(Reg)) { + // Set to 0 if the register is not a volatile VF/F8 register, or if it is + // used in the function. + if (BVCalleeSaved[Reg] || + (!PPC::F8RCRegClass.contains(Reg) && + !PPC::VFRCRegClass.contains(Reg)) || + (MF.getRegInfo().isPhysRegUsed(Reg))) + BVAllocatable.reset(Reg); + } + + bool AllSpilledToReg = true; + for (auto &CS : CSI) { + if (BVAllocatable.none()) + return false; + + unsigned Reg = CS.getReg(); + if (!PPC::G8RCRegClass.contains(Reg) && !PPC::GPRCRegClass.contains(Reg)) { + AllSpilledToReg = false; + continue; + } + + unsigned VolatileVFReg = BVAllocatable.find_first(); + if (VolatileVFReg < BVAllocatable.size()) { + CS.setDstReg(VolatileVFReg); + BVAllocatable.reset(VolatileVFReg); + } else { + AllSpilledToReg = false; + } + } + return AllSpilledToReg; +} + + bool PPCFrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, @@ -1986,12 +2068,18 @@ CSI[i].getFrameIdx())); } } else { - const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); - // Use !IsLiveIn for the kill flag. - // We do not want to kill registers that are live in this function - // before their use because they will become undefined registers. - TII.storeRegToStackSlot(MBB, MI, Reg, !IsLiveIn, - CSI[i].getFrameIdx(), RC, TRI); + if (CSI[i].isSpilledToReg()) { + NumPESpillVSR++; + BuildMI(MBB, MI, DL, TII.get(PPC::MTVSRD), CSI[i].getDstReg()) + .addReg(Reg, getKillRegState(true)); + } else { + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); + // Use !IsLiveIn for the kill flag. + // We do not want to kill registers that are live in this function + // before their use because they will become undefined registers. + TII.storeRegToStackSlot(MBB, MI, Reg, !IsLiveIn, + CSI[i].getFrameIdx(), RC, TRI); + } } } return true; @@ -2131,13 +2219,19 @@ CR2Spilled = CR3Spilled = CR4Spilled = false; } - // Default behavior for non-CR saves. - const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); - TII.loadRegFromStackSlot(MBB, I, Reg, CSI[i].getFrameIdx(), - RC, TRI); - assert(I != MBB.begin() && - "loadRegFromStackSlot didn't insert any code!"); + if (CSI[i].isSpilledToReg()) { + DebugLoc DL; + NumPEReloadVSR++; + BuildMI(MBB, I, DL, TII.get(PPC::MFVSRD), Reg) + .addReg(CSI[i].getDstReg(), getKillRegState(true)); + } else { + // Default behavior for non-CR saves. + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); + TII.loadRegFromStackSlot(MBB, I, Reg, CSI[i].getFrameIdx(), RC, TRI); + assert(I != MBB.begin() && + "loadRegFromStackSlot didn't insert any code!"); } + } // Insert in reverse order. if (AtStart) Index: test/CodeGen/PowerPC/prolog_vec_spills.ll =================================================================== --- /dev/null +++ test/CodeGen/PowerPC/prolog_vec_spills.ll @@ -0,0 +1,56 @@ +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -ppc-enable-pe-vector-spills -mcpu=pwr9 < %s | FileCheck %s -check-prefix=CHECK + +define signext i32 @test1(i32 signext %a, i32 signext %b) #0 { +entry: +; CHECK-LABEL: test1 +; CHECK: mtvsrd [[REG1:[0-9]+]], 14 +; CHECK: mtvsrd [[REG2:[0-9]+]], 15 +; CHECK: mtvsrd [[REG3:[0-9]+]], 16 +; CHECK: mffprd 16, [[REG3]] +; CHECK: mffprd 15, [[REG2]] +; CHECK: mffprd 14, [[REG1]] + %a.addr = alloca i32, align 4 + %b.addr = alloca i32, align 4 + %dst = alloca i32, align 4 + store i32 %a, i32* %a.addr, align 4 + store i32 %b, i32* %b.addr, align 4 + call void asm sideeffect "", "~{v20},~{f14},~{f0}"() + %0 = load i32, i32* %a.addr, align 4 + %1 = load i32, i32* %b.addr, align 4 + %2 = call i32 asm "add $0, $1, $2", "=r,r,r,~{r14},~{r15},~{r16}"(i32 %0, i32 %1) + store i32 %2, i32* %dst, align 4 + %3 = load i32, i32* %dst, align 4 + ret i32 %3 +} + +; Test with multiple basic blocks. +define signext i32 @test2(i32 signext %a, i32 signext %b) { +entry: +; CHECK-LABEL: test2 +; CHECK: mtvsrd [[REG1:[0-9]+]], 14 +; CHECK: mtvsrd [[REG2:[0-9]+]], 15 +; CHECK: mtvsrd [[REG3:[0-9]+]], 16 +; CHECK: mffprd 16, [[REG3]] +; CHECK: mffprd 15, [[REG2]] +; CHECK: mffprd 14, [[REG1]] + %0 = tail call i32 asm "add $0, $1, $2", "=r,r,r,~{r14},~{r15},~{r16}"(i32 %a, i32 %b) + %cmp = icmp sgt i32 %b, %a + %cmp112 = icmp sgt i32 %b, 0 + %or.cond = and i1 %cmp, %cmp112 + br i1 %or.cond, label %for.body.lr.ph, label %if.end + +for.body.lr.ph: ; preds = %entry + %add = sub i32 %b, %a + %1 = add i32 %0, %b + %2 = add i32 %b, -1 + %3 = mul i32 %add, %2 + %4 = add i32 %1, %3 + %5 = sub i32 %4, %a + br label %if.end + +if.end: ; preds = %for.body.lr.ph, %entry + %dst.1 = phi i32 [ %0, %entry ], [ %5, %for.body.lr.ph ] + ret i32 %dst.1 +} + +attributes #0 = { nounwind }