Index: lib/Target/AArch64/AArch64AsmPrinter.cpp =================================================================== --- lib/Target/AArch64/AArch64AsmPrinter.cpp +++ lib/Target/AArch64/AArch64AsmPrinter.cpp @@ -471,6 +471,7 @@ EmitToStreamer(*OutStreamer, TmpInst); return; } + case AArch64::B_EPILOGUE: case AArch64::TCRETURNdi: { MCOperand Dest; MCInstLowering.lowerOperand(MI->getOperand(0), Dest); Index: lib/Target/AArch64/AArch64FrameLowering.h =================================================================== --- lib/Target/AArch64/AArch64FrameLowering.h +++ lib/Target/AArch64/AArch64FrameLowering.h @@ -65,6 +65,14 @@ bool enableShrinkWrapping(const MachineFunction &MF) const override { return true; } + +private: + /// Try to replace ldp*, ret sequence with jump to shared epilogue + /// code. + bool tryJumpToSharedEpilogue(MachineBasicBlock &MBB, + MachineBasicBlock::iterator InsertBefore, + const std::vector<CalleeSavedInfo> &CSI, + const TargetRegisterInfo &TRI) const; }; } // End llvm namespace Index: lib/Target/AArch64/AArch64FrameLowering.cpp =================================================================== --- lib/Target/AArch64/AArch64FrameLowering.cpp +++ lib/Target/AArch64/AArch64FrameLowering.cpp @@ -101,6 +101,7 @@ #include "llvm/CodeGen/RegisterScavenging.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Function.h" +#include "llvm/IR/Mangler.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" @@ -113,6 +114,9 @@ cl::desc("enable use of redzone on AArch64"), cl::init(false), cl::Hidden); +static cl::opt<bool> EnableSharedEpilogues("aarch64-shared-epilogues", + cl::desc("Use shared epilogue code in compiler-rt"), cl::Hidden); + STATISTIC(NumRedZoneFunctions, "Number of functions using red zone"); bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const { @@ -531,6 +535,10 @@ MI.getOperand(RtIdx + 2).getReg() != AArch64::SP) return 0; return 2; + case AArch64::B_EPILOGUE: { + MachineOperand &MO 
= MI.getOperand(1); // NRegsRestored. + return MO.getImm(); + } } return 0; } @@ -607,6 +615,9 @@ unsigned NumRestores = 0; // Move past the restores of the callee-saved registers. MachineBasicBlock::iterator LastPopI = MBB.getFirstTerminator(); + // B_EPILOGUE is terminator and restores regs. + if (LastPopI != MBB.end() && LastPopI->getOpcode() == AArch64::B_EPILOGUE) + ++LastPopI; const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF); MachineBasicBlock::iterator Begin = MBB.begin(); while (LastPopI != Begin) { @@ -798,8 +809,83 @@ return true; } +/// We only put the most common epilogues into compiler-rt. This function +/// evaluates whether a given CalleeSavedInfo vector produces one of them. +static bool isSharedEpilogueAvailable(const std::vector<CalleeSavedInfo> &CSI) { + unsigned Count = CSI.size(); + unsigned Prefix = 0; + if (Count % 2 != 0 || Count == 0) + return false; + // We support all variants with LR,FP as first two registers. + if (CSI[0].getReg() == AArch64::LR && CSI[1].getReg() == AArch64::FP) + Prefix += 2; + // Last two registers are X27,X28? + if (CSI[Count-2].getReg() == AArch64::X27 && + CSI[Count-1].getReg() == AArch64::X28) { + // X27+X28 is allowed as a special case. + if (Count == 2) + return true; + // Otherwise we only support variants that also started with LR,FP. + if (Prefix != 2) + return false; + Count -= 2; + } + // The remaining registers must be pairs from X19 up to X26. 
+ static const unsigned Sequence[] = { + AArch64::X19, AArch64::X20, AArch64::X21, AArch64::X22, + AArch64::X23, AArch64::X24, AArch64::X25, AArch64::X26, + }; + if (Count-Prefix > array_lengthof(Sequence)) + return false; + for (unsigned I = Prefix; I < Count; ++I) { + if (CSI[I].getReg() != Sequence[I-Prefix]) + return false; + } + return true; +} + +bool AArch64FrameLowering::tryJumpToSharedEpilogue( + MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore, + const std::vector<CalleeSavedInfo> &CSI, + const TargetRegisterInfo &TRI) const { + assert(InsertBefore != MBB.end()); + if (!EnableSharedEpilogues) + return false; + + unsigned OpCode = InsertBefore->getOpcode(); + if (OpCode != AArch64::RET && OpCode != AArch64::RET_ReallyLR) + return false; + + if (!isSharedEpilogueAvailable(CSI)) + return false; + + // Construct label name of epilogue code. + SmallString<60> EpilogueName("__epilogue"); + for (const CalleeSavedInfo &Info : CSI) { + unsigned Reg = Info.getReg(); + EpilogueName += '_'; + EpilogueName += TRI.getName(Reg); + } + const MachineFunction &MF = *MBB.getParent(); + const DataLayout &TD = MF.getDataLayout(); + SmallString<60> MangledEpilogueName; + Mangler::getNameWithPrefix(MangledEpilogueName, EpilogueName, TD); + + // Build jump to shared epilogue code. + const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); + DebugLoc DL = InsertBefore->getDebugLoc(); + const MCInstrDesc &MCID = TII.get(AArch64::B_EPILOGUE); + BuildMI(MBB, InsertBefore, DL, MCID) + .addExternalSymbol(strdup(MangledEpilogueName.c_str())) + .addImm(CSI.size()) // NRegsRestored + .copyImplicitOps(&*InsertBefore); + // Remove ret. 
+ InsertBefore->removeFromParent(); + return true; +} + bool AArch64FrameLowering::restoreCalleeSavedRegisters( - MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, + MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore, const std::vector<CalleeSavedInfo> &CSI, const TargetRegisterInfo *TRI) const { MachineFunction &MF = *MBB.getParent(); @@ -808,8 +894,11 @@ DebugLoc DL; assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!"); - if (MI != MBB.end()) - DL = MI->getDebugLoc(); + if (InsertBefore != MBB.end()) { + DL = InsertBefore->getDebugLoc(); + if (tryJumpToSharedEpilogue(MBB, InsertBefore, CSI, *TRI)) + return true; + } for (unsigned i = 0; i < Count; i += 2) { unsigned Reg1 = CSI[i].getReg(); @@ -855,7 +944,7 @@ const int Offset = (i == Count - 2) ? Count : Count - i - 2; assert((Offset >= -64 && Offset <= 63) && "Offset out of bounds for LDP immediate"); - MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(LdrOpc)); + MachineInstrBuilder MIB = BuildMI(MBB, InsertBefore, DL, TII.get(LdrOpc)); if (LdrOpc == AArch64::LDPXpost || LdrOpc == AArch64::LDPDpost) MIB.addReg(AArch64::SP, RegState::Define); Index: lib/Target/AArch64/AArch64InstrInfo.td =================================================================== --- lib/Target/AArch64/AArch64InstrInfo.td +++ lib/Target/AArch64/AArch64InstrInfo.td @@ -5994,4 +5994,8 @@ def : Pat<(AArch64tcret texternalsym:$dst, (i32 timm:$FPDiff)), (TCRETURNdi texternalsym:$dst, imm:$FPDiff)>; +let isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [SP] in { + def B_EPILOGUE : Pseudo<(outs), (ins i64imm:$dst, i32imm:$NRegsRestored),[]>; +} + include "AArch64InstrAtomics.td" Index: test/CodeGen/AArch64/shared_epilogues.ll =================================================================== --- /dev/null +++ test/CodeGen/AArch64/shared_epilogues.ll @@ -0,0 +1,116 @@ +; RUN: llc -aarch64-shared-epilogues=1 -o - %s | FileCheck %s --check-prefix CHECK --check-prefix SHARED +; RUN: llc -aarch64-shared-epilogues=0 
-o - %s | FileCheck %s --check-prefix CHECK --check-prefix NOSHARE +target triple="aarch64--" + +declare void @extfunc() + +; CHECK-LABEL: f0: +define void @f0() { +; CHECK: stp x29, x30, [sp, #-16] +; CHECK: bl extfunc + call void @extfunc() +; NOSHARE: ldp x29, x30, [sp], #16 +; NOSHARE-NEXT: ret +; SHARED: b __epilogue_LR_FP +; SHARED-NOT: ret + ret void +} + +@v0 = external global i32 +@v1 = external global i32 +@v2 = external global i32 +@v3 = external global i32 +@v4 = external global i32 +@v5 = external global i32 + +; CHECK-LABEL: f1: +define void @f1() { +; CHECK: stp x20, x19, [sp, #-32]! +; CHECK-NEXT: stp x29, x30, [sp, #16] + %v0 = load volatile i32, i32* @v0 +; CHECK: bl extfunc + call void @extfunc() + store volatile i32 %v0, i32* @v0 +; NOSHARE: ldp x29, x30, [sp, #16] +; NOSHARE-NEXT: ldp x20, x19, [sp], #32 +; NOSHARE-NEXT: ret +; SHARED: b __epilogue_LR_FP_X19_X20 +; SHARED-NOT: ret + ret void +} + +; CHECK-LABEL: f2: +define void @f2() { +; CHECK: stp x28, x27, [sp, #-96]! 
+; CHECK-NEXT: stp x26, x25, [sp, #16] +; CHECK-NEXT: stp x24, x23, [sp, #32] +; CHECK-NEXT: stp x22, x21, [sp, #48] +; CHECK-NEXT: stp x20, x19, [sp, #64] +; CHECK-NEXT: stp x29, x30, [sp, #80] + %v0 = load volatile i32, i32* @v0 + %v1 = load volatile i32, i32* @v1 + %v2 = load volatile i32, i32* @v2 + %v3 = load volatile i32, i32* @v3 + %v4 = load volatile i32, i32* @v4 + %v5 = load volatile i32, i32* @v5 +; CHECK: bl extfunc + call void @extfunc() + store volatile i32 %v0, i32* @v0 + store volatile i32 %v1, i32* @v1 + store volatile i32 %v2, i32* @v2 + store volatile i32 %v3, i32* @v3 + store volatile i32 %v4, i32* @v4 + store volatile i32 %v5, i32* @v5 +; NOSHARE: ldp x29, x30, [sp, #80] +; NOSHARE-NEXT: ldp x20, x19, [sp, #64] +; NOSHARE-NEXT: ldp x22, x21, [sp, #48] +; NOSHARE-NEXT: ldp x24, x23, [sp, #32] +; NOSHARE-NEXT: ldp x26, x25, [sp, #16] +; NOSHARE-NEXT: ldp x28, x27, [sp], #96 +; NOSHARE-NEXT: ret +; SHARED: b __epilogue_LR_FP_X19_X20_X21_X22_X23_X24_X25_X26_X27_X28 +; SHARED-NOT: ret + ret void +} + +; CHECK-LABEL: a0: +define void @a0() { + call void asm sideeffect "", "~{x19},~{x20},~{x21},~{x22}"() +; NOSHARE: ldp x20, x19, [sp, #16] +; NOSHARE-NEXT: ldp x22, x21, [sp], #32 +; NOSHARE-NEXT: ret +; SHARED: b __epilogue_X19_X20_X21_X22 +; SHARED-NOT: ret + ret void +} + +; CHECK-LABEL: a1: +define void @a1() { + call void asm sideeffect "", "~{x27},~{x28}"() +; NOSHARE: ldp x28, x27, [sp], #16 +; NOSHARE-NEXT: ret +; SHARED: b __epilogue_X27_X28 +; SHARED-NOT: ret + ret void +} + +; CHECK-LABEL: a2: +define void @a2() { + call void asm sideeffect "", "~{x25},~{x26}"() +; This epilogue pattern is not present in compiler-rt +; CHECK-NOT: b __epilogue +; CHECK: ldp x26, x25, [sp], #16 +; CHECK-NEXT: ret + ret void +} + +; CHECK-LABEL: a3: +define void @a3() { + call void asm sideeffect "", "~{X19},~{X20},~{x27},~{x28}"() +; This epilogue pattern is not present in compiler-rt +; CHECK-NOT: b __epilogue +; CHECK: ldp x20, x19, [sp, #16] +; 
CHECK-NEXT: ldp x28, x27, [sp], #32 +; CHECK-NEXT: ret + ret void +}