Index: include/llvm/CodeGen/MachineOutliner.h =================================================================== --- include/llvm/CodeGen/MachineOutliner.h +++ include/llvm/CodeGen/MachineOutliner.h @@ -19,6 +19,7 @@ #include "llvm/CodeGen/LiveRegUnits.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/TargetRegisterInfo.h" +#include "llvm/CodeGen/LivePhysRegs.h" namespace llvm { namespace outliner { @@ -74,6 +75,13 @@ /// cost model information. LiveRegUnits LRU; + /// Contains the accumulated register liveness information for the + /// instructions in this \p Candidate. + /// + /// This is optionally used by the target to determine which registers have + /// been used across the sequence. + LiveRegUnits UsedInSequence; + /// Return the number of instructions in this Candidate. unsigned getLength() const { return Len; } @@ -137,6 +145,12 @@ // outlining candidate. std::for_each(MBB->rbegin(), (MachineBasicBlock::reverse_iterator)front(), [this](MachineInstr &MI) { LRU.stepBackward(MI); }); + + // Walk over the sequence itself and figure out which registers were used + // in the sequence. + UsedInSequence.init(TRI); + std::for_each(front(), std::next(back()), + [this](MachineInstr &MI) { UsedInSequence.accumulate(MI); }); } }; Index: lib/Target/AArch64/AArch64InstrInfo.h =================================================================== --- lib/Target/AArch64/AArch64InstrInfo.h +++ lib/Target/AArch64/AArch64InstrInfo.h @@ -272,6 +272,10 @@ ArrayRef Cond) const; bool substituteCmpToZero(MachineInstr &CmpInstr, unsigned SrcReg, const MachineRegisterInfo *MRI) const; + + /// Returns an unused general-purpose register which can be used for + /// constructing an outlined call if one exists. Returns 0 otherwise. 
+ unsigned findRegisterToSaveLRTo(const outliner::Candidate &C) const; }; /// emitFrameOffset - Emit instructions as needed to set DestReg to SrcReg Index: lib/Target/AArch64/AArch64InstrInfo.cpp =================================================================== --- lib/Target/AArch64/AArch64InstrInfo.cpp +++ lib/Target/AArch64/AArch64InstrInfo.cpp @@ -4851,75 +4851,92 @@ return makeArrayRef(TargetFlags); } - /// Constants defining how certain sequences should be outlined. - /// This encompasses how an outlined function should be called, and what kind of - /// frame should be emitted for that outlined function. - /// - /// \p MachineOutlinerDefault implies that the function should be called with - /// a save and restore of LR to the stack. - /// - /// That is, - /// - /// I1 Save LR OUTLINED_FUNCTION: - /// I2 --> BL OUTLINED_FUNCTION I1 - /// I3 Restore LR I2 - /// I3 - /// RET - /// - /// * Call construction overhead: 3 (save + BL + restore) - /// * Frame construction overhead: 1 (ret) - /// * Requires stack fixups? Yes - /// - /// \p MachineOutlinerTailCall implies that the function is being created from - /// a sequence of instructions ending in a return. - /// - /// That is, - /// - /// I1 OUTLINED_FUNCTION: - /// I2 --> B OUTLINED_FUNCTION I1 - /// RET I2 - /// RET - /// - /// * Call construction overhead: 1 (B) - /// * Frame construction overhead: 0 (Return included in sequence) - /// * Requires stack fixups? No - /// - /// \p MachineOutlinerNoLRSave implies that the function should be called using - /// a BL instruction, but doesn't require LR to be saved and restored. This - /// happens when LR is known to be dead. - /// - /// That is, - /// - /// I1 OUTLINED_FUNCTION: - /// I2 --> BL OUTLINED_FUNCTION I1 - /// I3 I2 - /// I3 - /// RET - /// - /// * Call construction overhead: 1 (BL) - /// * Frame construction overhead: 1 (RET) - /// * Requires stack fixups? 
No - /// - /// \p MachineOutlinerThunk implies that the function is being created from - /// a sequence of instructions ending in a call. The outlined function is - /// called with a BL instruction, and the outlined function tail-calls the - /// original call destination. - /// - /// That is, - /// - /// I1 OUTLINED_FUNCTION: - /// I2 --> BL OUTLINED_FUNCTION I1 - /// BL f I2 - /// B f - /// * Call construction overhead: 1 (BL) - /// * Frame construction overhead: 0 - /// * Requires stack fixups? No - /// +/// Constants defining how certain sequences should be outlined. +/// This encompasses how an outlined function should be called, and what kind of +/// frame should be emitted for that outlined function. +/// +/// \p MachineOutlinerDefault implies that the function should be called with +/// a save and restore of LR to the stack. +/// +/// That is, +/// +/// I1 Save LR OUTLINED_FUNCTION: +/// I2 --> BL OUTLINED_FUNCTION I1 +/// I3 Restore LR I2 +/// I3 +/// RET +/// +/// * Call construction overhead: 3 (save + BL + restore) +/// * Frame construction overhead: 1 (ret) +/// * Requires stack fixups? Yes +/// +/// \p MachineOutlinerTailCall implies that the function is being created from +/// a sequence of instructions ending in a return. +/// +/// That is, +/// +/// I1 OUTLINED_FUNCTION: +/// I2 --> B OUTLINED_FUNCTION I1 +/// RET I2 +/// RET +/// +/// * Call construction overhead: 1 (B) +/// * Frame construction overhead: 0 (Return included in sequence) +/// * Requires stack fixups? No +/// +/// \p MachineOutlinerNoLRSave implies that the function should be called using +/// a BL instruction, but doesn't require LR to be saved and restored. This +/// happens when LR is known to be dead. +/// +/// That is, +/// +/// I1 OUTLINED_FUNCTION: +/// I2 --> BL OUTLINED_FUNCTION I1 +/// I3 I2 +/// I3 +/// RET +/// +/// * Call construction overhead: 1 (BL) +/// * Frame construction overhead: 1 (RET) +/// * Requires stack fixups? 
No +/// +/// \p MachineOutlinerThunk implies that the function is being created from +/// a sequence of instructions ending in a call. The outlined function is +/// called with a BL instruction, and the outlined function tail-calls the +/// original call destination. +/// +/// That is, +/// +/// I1 OUTLINED_FUNCTION: +/// I2 --> BL OUTLINED_FUNCTION I1 +/// BL f I2 +/// B f +/// * Call construction overhead: 1 (BL) +/// * Frame construction overhead: 0 +/// * Requires stack fixups? No +/// +/// \p MachineOutlinerRegSave implies that the function should be called with a +/// save and restore of LR to an available register. This allows us to avoid +/// stack fixups. Note that this outlining variant is compatible with the +/// NoLRSave case. +/// +/// That is, +/// +/// I1 Save LR OUTLINED_FUNCTION: +/// I2 --> BL OUTLINED_FUNCTION I1 +/// I3 Restore LR I2 +/// I3 +/// RET +/// +/// * Call construction overhead: 3 (save + BL + restore) +/// * Frame construction overhead: 1 (ret) +/// * Requires stack fixups? No enum MachineOutlinerClass { MachineOutlinerDefault, /// Emit a save, restore, call, and return. MachineOutlinerTailCall, /// Only emit a branch. MachineOutlinerNoLRSave, /// Emit a call and return. MachineOutlinerThunk, /// Emit a call and tail-call. + MachineOutlinerRegSave /// Same as default, but save to a register. }; enum MachineOutlinerMBBFlags { @@ -4927,6 +4944,27 @@ HasCalls = 0x4 }; +unsigned +AArch64InstrInfo::findRegisterToSaveLRTo(const outliner::Candidate &C) const { + MachineFunction *MF = C.getMF(); + const AArch64RegisterInfo *ARI = static_cast<const AArch64RegisterInfo *>( + MF->getSubtarget().getRegisterInfo()); + + // Check if there is an available register across the sequence that we can + // use. + for (unsigned Reg : AArch64::GPR64RegClass) { + if (!ARI->isReservedReg(*MF, Reg) && + Reg != AArch64::LR && // LR is not reserved, but don't use it. + Reg != AArch64::X16 && // X16 is not guaranteed to be preserved. + Reg != AArch64::X17 && // Ditto for X17.
+ C.LRU.available(Reg) && C.UsedInSequence.available(Reg)) + return Reg; + } + + // No suitable register. Return 0. + return 0u; +} + outliner::OutlinedFunction AArch64InstrInfo::getOutliningCandidateInfo( std::vector<outliner::Candidate> &RepeatedSequenceLocs) const { @@ -5015,11 +5053,27 @@ SetCandidateCallInfo(MachineOutlinerNoLRSave, 4); } - // LR is live, so we need to save it to the stack. + // LR is live, so we need to save it. Decide whether it should be saved to + // the stack, or if it can be saved to a register. else { - FrameID = MachineOutlinerDefault; - NumBytesToCreateFrame = 4; - SetCandidateCallInfo(MachineOutlinerDefault, 12); + if (std::all_of(RepeatedSequenceLocs.begin(), RepeatedSequenceLocs.end(), + [this](outliner::Candidate &C) { + return findRegisterToSaveLRTo(C); + })) { + // Every candidate has an unused register available for the save. + // We can save LR to a register. + FrameID = MachineOutlinerRegSave; + NumBytesToCreateFrame = 4; + SetCandidateCallInfo(MachineOutlinerRegSave, 12); + } + + else { + // At least one candidate does not have an unused register available. + // We must save LR to the stack. + FrameID = MachineOutlinerDefault; + NumBytesToCreateFrame = 4; + SetCandidateCallInfo(MachineOutlinerDefault, 12); + } } // Check if the range contains a call. These require a save + restore of the @@ -5424,7 +5478,7 @@ MBB.insert(MBB.end(), ret); // Did we have to modify the stack by saving the link register? - if (OF.FrameConstructionID == MachineOutlinerNoLRSave) + if (OF.FrameConstructionID != MachineOutlinerDefault) return; // We modified the stack. @@ -5457,13 +5511,41 @@ // We want to return the spot where we inserted the call. MachineBasicBlock::iterator CallPt; - // We have a default call. Save the link register.
- MachineInstr *STRXpre = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre)) - .addReg(AArch64::SP, RegState::Define) - .addReg(AArch64::LR) - .addReg(AArch64::SP) - .addImm(-16); - It = MBB.insert(It, STRXpre); + // Instructions for saving and restoring LR around the call instruction we're + // going to insert. + MachineInstr *Save; + MachineInstr *Restore; + // Can we save to a register? + if (C.CallConstructionID == MachineOutlinerRegSave) { + // FIXME: This logic should be sunk into a target-specific interface so that + // we don't have to recompute the register. + unsigned Reg = findRegisterToSaveLRTo(C); + assert(Reg != 0 && "No callee-saved register available?"); + + // Save and restore LR from that register. + Save = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), Reg) + .addReg(AArch64::XZR) + .addReg(AArch64::LR) + .addImm(0); + Restore = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), AArch64::LR) + .addReg(AArch64::XZR) + .addReg(Reg) + .addImm(0); + } else { + // We have the default case. Save and restore from SP. + Save = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre)) + .addReg(AArch64::SP, RegState::Define) + .addReg(AArch64::LR) + .addReg(AArch64::SP) + .addImm(-16); + Restore = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost)) + .addReg(AArch64::SP, RegState::Define) + .addReg(AArch64::LR, RegState::Define) + .addReg(AArch64::SP) + .addImm(16); + } + + It = MBB.insert(It, Save); It++; // Insert the call. @@ -5472,14 +5554,7 @@ CallPt = It; It++; - // Restore the link register. 
- MachineInstr *LDRXpost = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost)) - .addReg(AArch64::SP, RegState::Define) - .addReg(AArch64::LR, RegState::Define) - .addReg(AArch64::SP) - .addImm(16); - It = MBB.insert(It, LDRXpost); - + It = MBB.insert(It, Restore); return CallPt; } Index: test/CodeGen/AArch64/machine-outliner-regsave.mir =================================================================== --- /dev/null +++ test/CodeGen/AArch64/machine-outliner-regsave.mir @@ -0,0 +1,112 @@ +# RUN: llc -mtriple=aarch64-apple-darwin -run-pass=prologepilog \ +# RUN: -run-pass=machine-outliner -verify-machineinstrs %s -o - | FileCheck %s +# Check that we save LR to a callee-saved register when possible. +# foo() should use a callee-saved register. However, bar() should not. +--- | + + define void @foo() #0 { + ret void + } + + define void @bar() #0 { + ret void + } + + attributes #0 = { minsize noinline noredzone "no-frame-pointer-elim"="true" } +... +--- +# Make sure that when we outline and a register is available, we +# use it to save + restore LR instead of SP. 
+# CHECK: name: foo +# CHECK-DAG: bb.0 +# CHECK-DAG: $x[[REG:[0-9]+]] = ORRXrs $xzr, $lr, 0 +# CHECK-NEXT: BL +# CHECK-NEXT: $lr = ORRXrs $xzr, $x[[REG]], 0 +# CHECK-DAG: bb.1 +# CHECK-DAG: $x[[REG]] = ORRXrs $xzr, $lr, 0 +# CHECK-NEXT: BL +# CHECK-NEXT: $lr = ORRXrs $xzr, $x[[REG]], 0 +# CHECK-DAG: bb.2 +# CHECK-DAG: $x[[REG]] = ORRXrs $xzr, $lr, 0 +# CHECK-NEXT: BL +# CHECK-NEXT: $lr = ORRXrs $xzr, $x[[REG]], 0 +name: foo +tracksRegLiveness: true +fixedStack: +body: | + bb.0: + liveins: $lr, $w9 + $x25 = ORRXri $xzr, 1 + $w9 = ORRWri $wzr, 1 + $w9 = ORRWri $wzr, 1 + $w9 = ORRWri $wzr, 1 + $w9 = ORRWri $wzr, 1 + $w9 = ORRWri $wzr, 1 + $w9 = ORRWri $wzr, 2 + bb.1: + liveins: $lr, $w9 + $w9 = ORRWri $wzr, 1 + $w9 = ORRWri $wzr, 1 + $w9 = ORRWri $wzr, 1 + $w9 = ORRWri $wzr, 1 + $w9 = ORRWri $wzr, 1 + $w9 = ORRWri $wzr, 2 + bb.2: + liveins: $lr, $w9 + $w9 = ORRWri $wzr, 1 + $w9 = ORRWri $wzr, 1 + $w9 = ORRWri $wzr, 1 + $w9 = ORRWri $wzr, 1 + $w9 = ORRWri $wzr, 1 + $w9 = ORRWri $wzr, 2 + RET undef $lr + +... +--- +# Convoluted case that shows that we'll still save to the stack when there are +# no appropriate registers available. +# The live-in lists do not contain x16 or x17 since including them would cause +# nothing to be outlined. +# They also deliberately don't contain x18 to show that on Darwin we won't store +# to that.
+# CHECK-LABEL: name: bar +# CHECK: early-clobber $sp = STRXpre $lr, $sp, -16 +# CHECK-NEXT: BL +# CHECK-DAG: early-clobber $sp, $lr = LDRXpost $sp, 16 +# CHECK: early-clobber $sp = STRXpre $lr, $sp, -16 +# CHECK-NEXT: BL +# CHECK-DAG: early-clobber $sp, $lr = LDRXpost $sp, 16 +# CHECK: early-clobber $sp = STRXpre $lr, $sp, -16 +# CHECK-NEXT: BL +# CHECK-NEXT: early-clobber $sp, $lr = LDRXpost $sp, 16 +name: bar +tracksRegLiveness: true +body: | + bb.0: + liveins: $lr, $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7, $x8, $x9, $x10, $x11, $x12, $x13, $x14, $x15, $x19, $x20, $x21, $x22, $x23, $x20, $x21, $x22, $x23, $x24, $x25, $x26, $x27, $x28 + $w10 = ORRWri $wzr, 1 + $w10 = ORRWri $wzr, 1 + $w10 = ORRWri $wzr, 1 + $w10 = ORRWri $wzr, 1 + $w10 = ORRWri $wzr, 1 + $w12 = ORRWri $wzr, 2 + bb.1: + liveins: $lr, $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7, $x8, $x9, $x10, $x11, $x12, $x13, $x14, $x15, $x19, $x20, $x21, $x22, $x23, $x20, $x21, $x22, $x23, $x24, $x25, $x26, $x27, $x28 + $w10 = ORRWri $wzr, 1 + $w10 = ORRWri $wzr, 1 + $w10 = ORRWri $wzr, 1 + $w10 = ORRWri $wzr, 1 + $w10 = ORRWri $wzr, 1 + $w12 = ORRWri $wzr, 2 + bb.2: + liveins: $lr, $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7, $x8, $x9, $x10, $x11, $x12, $x13, $x14, $x15, $x19, $x20, $x21, $x22, $x23, $x20, $x21, $x22, $x23, $x24, $x25, $x26, $x27, $x28 + $w10 = ORRWri $wzr, 1 + $w10 = ORRWri $wzr, 1 + $w10 = ORRWri $wzr, 1 + $w10 = ORRWri $wzr, 1 + $w10 = ORRWri $wzr, 1 + $w12 = ORRWri $wzr, 2 + bb.3: + liveins: $lr, $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7, $x8, $x9, $x10, $x11, $x12, $x13, $x14, $x15, $x19, $x20, $x21, $x22, $x23, $x20, $x21, $x22, $x23, $x24, $x25, $x26, $x27, $x28 + RET undef $lr + Index: test/CodeGen/AArch64/machine-outliner.ll =================================================================== --- test/CodeGen/AArch64/machine-outliner.ll +++ test/CodeGen/AArch64/machine-outliner.ll @@ -82,17 +82,17 @@ ; CHECK: .p2align 2 ; CHECK-NEXT: [[OUTLINED]]: ; CHECK: orr w8, wzr, #0x1 -; CHECK-NEXT: str w8, 
[sp, #44] +; CHECK-NEXT: str w8, [sp, #28] ; CHECK-NEXT: orr w8, wzr, #0x2 -; CHECK-NEXT: str w8, [sp, #40] +; CHECK-NEXT: str w8, [sp, #24] ; CHECK-NEXT: orr w8, wzr, #0x3 -; CHECK-NEXT: str w8, [sp, #36] +; CHECK-NEXT: str w8, [sp, #20] ; CHECK-NEXT: orr w8, wzr, #0x4 -; CHECK-NEXT: str w8, [sp, #32] +; CHECK-NEXT: str w8, [sp, #16] ; CHECK-NEXT: mov w8, #5 -; CHECK-NEXT: str w8, [sp, #28] +; CHECK-NEXT: str w8, [sp, #12] ; CHECK-NEXT: orr w8, wzr, #0x6 -; CHECK-NEXT: str w8, [sp, #24] +; CHECK-NEXT: str w8, [sp, #8] ; CHECK-NEXT: ret attributes #0 = { noredzone "target-cpu"="cyclone" } Index: test/CodeGen/AArch64/machine-outliner.mir =================================================================== --- test/CodeGen/AArch64/machine-outliner.mir +++ test/CodeGen/AArch64/machine-outliner.mir @@ -28,19 +28,19 @@ # CHECK-LABEL: name: main # CHECK: BL @OUTLINED_FUNCTION_[[F0:[0-9]+]] -# CHECK-NEXT: early-clobber $sp, $lr = LDRXpost $sp, 16 +# CHECK-NEXT: $lr = ORRXrs $xzr, $x[[REG:[0-9]+]], 0 # CHECK-NEXT: $x16 = ADDXri $sp, 48, 0 # CHECK-NEXT: STRHHroW $w16, $x9, $w30, 1, 1 # CHECK-NEXT: $lr = ORRXri $xzr, 1 # CHECK: BL @OUTLINED_FUNCTION_[[F0]] -# CHECK-NEXT: early-clobber $sp, $lr = LDRXpost $sp, 16 +# CHECK-NEXT: $lr = ORRXrs $xzr, $x[[REG]], 0 # CHECK-NEXT: $x16 = ADDXri $sp, 48, 0 # CHECK-NEXT: STRHHroW $w16, $x9, $w30, 1, 1 # CHECK-NEXT: $lr = ORRXri $xzr, 1 # CHECK: BL @OUTLINED_FUNCTION_[[F0]] -# CHECK-NEXT: early-clobber $sp, $lr = LDRXpost $sp, 16 +# CHECK-NEXT: $lr = ORRXrs $xzr, $x[[REG]], 0 # CHECK-NEXT: $x16 = ADDXri $sp, 48, 0 # CHECK-NEXT: STRHHroW $w16, $x9, $w30, 1, 1 # CHECK-NEXT: $lr = ORRXri $xzr, 1