Index: llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -882,15 +882,6 @@
     MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
     const DebugLoc &DL, const TargetInstrInfo *TII, int CSStackSizeInc,
     bool NeedsWinCFI, bool *HasWinCFI, bool InProlog = true) {
-  // Ignore instructions that do not operate on SP, i.e. shadow call stack
-  // instructions and associated CFI instruction.
-  while (MBBI->getOpcode() == AArch64::STRXpost ||
-         MBBI->getOpcode() == AArch64::LDRXpre ||
-         MBBI->getOpcode() == AArch64::CFI_INSTRUCTION) {
-    if (MBBI->getOpcode() != AArch64::CFI_INSTRUCTION)
-      assert(MBBI->getOperand(0).getReg() != AArch64::SP);
-    ++MBBI;
-  }
   unsigned NewOpc;
   switch (MBBI->getOpcode()) {
   default:
@@ -998,16 +989,6 @@
     return;
 
   unsigned Opc = MI.getOpcode();
-
-  // Ignore instructions that do not operate on SP, i.e. shadow call stack
-  // instructions and associated CFI instruction.
-  if (Opc == AArch64::STRXpost || Opc == AArch64::LDRXpre ||
-      Opc == AArch64::CFI_INSTRUCTION) {
-    if (Opc != AArch64::CFI_INSTRUCTION)
-      assert(MI.getOperand(0).getReg() != AArch64::SP);
-    return;
-  }
-
   unsigned Scale;
   switch (Opc) {
   case AArch64::STPXi:
@@ -1049,38 +1030,6 @@
   }
 }
 
-static void adaptForLdStOpt(MachineBasicBlock &MBB,
-                            MachineBasicBlock::iterator FirstSPPopI,
-                            MachineBasicBlock::iterator LastPopI) {
-  // Sometimes (when we restore in the same order as we save), we can end up
-  // with code like this:
-  //
-  // ldp x26, x25, [sp]
-  // ldp x24, x23, [sp, #16]
-  // ldp x22, x21, [sp, #32]
-  // ldp x20, x19, [sp, #48]
-  // add sp, sp, #64
-  //
-  // In this case, it is always better to put the first ldp at the end, so
-  // that the load-store optimizer can run and merge the ldp and the add into
-  // a post-index ldp.
-  // If we managed to grab the first pop instruction, move it to the end.
-  if (ReverseCSRRestoreSeq)
-    MBB.splice(FirstSPPopI, &MBB, LastPopI);
-  // We should end up with something like this now:
-  //
-  // ldp x24, x23, [sp, #16]
-  // ldp x22, x21, [sp, #32]
-  // ldp x20, x19, [sp, #48]
-  // ldp x26, x25, [sp]
-  // add sp, sp, #64
-  //
-  // and the load-store optimizer can merge the last two instructions into:
-  //
-  // ldp x26, x25, [sp], #64
-  //
-}
-
 static bool isTargetWindows(const MachineFunction &MF) {
   return MF.getSubtarget<AArch64Subtarget>().isTargetWindows();
 }
@@ -1099,6 +1048,72 @@
   }
 }
 
+static bool needsShadowCallStackPrologueEpilogue(MachineFunction &MF) {
+  if (!(llvm::any_of(
+            MF.getFrameInfo().getCalleeSavedInfo(),
+            [](const auto &Info) { return Info.getReg() == AArch64::LR; }) &&
+        MF.getFunction().hasFnAttribute(Attribute::ShadowCallStack)))
+    return false;
+
+  if (!MF.getSubtarget<AArch64Subtarget>().isXRegisterReserved(18))
+    report_fatal_error("Must reserve x18 to use shadow call stack");
+
+  return true;
+}
+
+static void emitShadowCallStackPrologue(const TargetInstrInfo &TII,
+                                        MachineFunction &MF,
+                                        MachineBasicBlock &MBB,
+                                        MachineBasicBlock::iterator MBBI,
+                                        const DebugLoc &DL, bool NeedsWinCFI,
+                                        bool NeedsUnwindInfo) {
+  // Shadow call stack prolog: str x30, [x18], #8
+  BuildMI(MBB, MBBI, DL, TII.get(AArch64::STRXpost))
+      .addReg(AArch64::X18, RegState::Define)
+      .addReg(AArch64::LR)
+      .addReg(AArch64::X18)
+      .addImm(8)
+      .setMIFlag(MachineInstr::FrameSetup);
+
+  // This instruction also makes x18 live-in to the entry block.
+  MBB.addLiveIn(AArch64::X18);
+
+  if (NeedsWinCFI)
+    BuildMI(MBB, MBBI, DL, TII.get(AArch64::SEH_Nop))
+        .setMIFlag(MachineInstr::FrameSetup);
+
+  if (NeedsUnwindInfo) {
+    // Emit a CFI instruction that causes 8 to be subtracted from the value of
+    // x18 when unwinding past this frame.
+    static const char CFIInst[] = {
+        dwarf::DW_CFA_val_expression,
+        18, // register
+        2,  // length
+        static_cast<char>(unsigned(dwarf::DW_OP_breg18)),
+        static_cast<char>(-8) & 0x7f, // addend (sleb128)
+    };
+    unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createEscape(
+        nullptr, StringRef(CFIInst, sizeof(CFIInst))));
+    BuildMI(MBB, MBBI, DL, TII.get(AArch64::CFI_INSTRUCTION))
+        .addCFIIndex(CFIIndex)
+        .setMIFlag(MachineInstr::FrameSetup);
+  }
+}
+
+static void emitShadowCallStackEpilogue(const TargetInstrInfo &TII,
+                                        MachineFunction &MF,
+                                        MachineBasicBlock &MBB,
+                                        MachineBasicBlock::iterator MBBI,
+                                        const DebugLoc &DL) {
+  // Shadow call stack epilog: ldr x30, [x18, #-8]!
+  BuildMI(MBB, MBBI, DL, TII.get(AArch64::LDRXpre))
+      .addReg(AArch64::X18, RegState::Define)
+      .addReg(AArch64::LR, RegState::Define)
+      .addReg(AArch64::X18)
+      .addImm(-8)
+      .setMIFlag(MachineInstr::FrameDestroy);
+}
+
 void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
                                         MachineBasicBlock &MBB) const {
   MachineBasicBlock::iterator MBBI = MBB.begin();
@@ -1127,6 +1142,10 @@
 
   DebugLoc DL;
 
   const auto &MFnI = *MF.getInfo<AArch64FunctionInfo>();
+  if (needsShadowCallStackPrologueEpilogue(MF))
+    emitShadowCallStackPrologue(*TII, MF, MBB, MBBI, DL, NeedsWinCFI,
+                                MFnI.needsDwarfUnwindInfo());
+
   if (MFnI.shouldSignReturnAddress()) {
     unsigned PACI;
@@ -1702,6 +1721,11 @@
     IsFunclet = isFuncletReturnInstr(*MBBI);
   }
 
+  auto ShadowStackEpilogue = make_scope_exit([&]() {
+    if (needsShadowCallStackPrologueEpilogue(MF))
+      emitShadowCallStackEpilogue(*TII, MF, MBB, MBB.getFirstTerminator(), DL);
+  });
+
   int64_t NumBytes =
       IsFunclet ? getWinEHFuncletFrameSize(MF) : MFI.getStackSize();
   AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
@@ -1918,18 +1942,12 @@
     if (NoCalleeSaveRestore)
       StackRestoreBytes += AfterCSRPopSize;
 
-    // If we were able to combine the local stack pop with the argument pop,
-    // then we're done.
-    bool Done = NoCalleeSaveRestore || AfterCSRPopSize == 0;
-
-    // If we're done after this, make sure to help the load store optimizer.
-    if (Done)
-      adaptForLdStOpt(MBB, MBB.getFirstTerminator(), LastPopI);
-
     emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP,
                     StackOffset::getFixed(StackRestoreBytes), TII,
                     MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI);
-    if (Done) {
+    // If we were able to combine the local stack pop with the argument pop,
+    // then we're done.
+    if (NoCalleeSaveRestore || AfterCSRPopSize == 0) {
       if (HasWinCFI) {
         BuildMI(MBB, MBB.getFirstTerminator(), DL,
                 TII->get(AArch64::SEH_EpilogEnd))
@@ -1961,21 +1979,8 @@
   if (AfterCSRPopSize) {
     assert(AfterCSRPopSize > 0 && "attempting to reallocate arg stack that an "
                                   "interrupt may have clobbered");
-    // Find an insertion point for the first ldp so that it goes before the
-    // shadow call stack epilog instruction. This ensures that the restore of
-    // lr from x18 is placed after the restore from sp.
-    auto FirstSPPopI = MBB.getFirstTerminator();
-    while (FirstSPPopI != Begin) {
-      auto Prev = std::prev(FirstSPPopI);
-      if (Prev->getOpcode() != AArch64::LDRXpre ||
-          Prev->getOperand(0).getReg() == AArch64::SP)
-        break;
-      FirstSPPopI = Prev;
-    }
-
-    adaptForLdStOpt(MBB, FirstSPPopI, LastPopI);
-
-    emitFrameOffset(MBB, FirstSPPopI, DL, AArch64::SP, AArch64::SP,
+    emitFrameOffset(MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP,
                     StackOffset::getFixed(AfterCSRPopSize), TII,
                     MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI);
   }
@@ -2277,7 +2282,7 @@
 static void computeCalleeSaveRegisterPairs(
     MachineFunction &MF, ArrayRef<CalleeSavedInfo> CSI,
     const TargetRegisterInfo *TRI, SmallVectorImpl<RegPairInfo> &RegPairs,
-    bool &NeedShadowCallStackProlog, bool NeedsFrameRecord) {
+    bool NeedsFrameRecord) {
 
   if (CSI.empty())
     return;
@@ -2356,15 +2361,6 @@
     }
   }
 
-  // If either of the registers to be saved is the lr register, it means that
-  // we also need to save lr in the shadow call stack.
-  if ((RPI.Reg1 == AArch64::LR || RPI.Reg2 == AArch64::LR) &&
-      MF.getFunction().hasFnAttribute(Attribute::ShadowCallStack)) {
-    if (!MF.getSubtarget<AArch64Subtarget>().isXRegisterReserved(18))
-      report_fatal_error("Must reserve x18 to use shadow call stack");
-    NeedShadowCallStackProlog = true;
-  }
-
   // GPRs and FPRs are saved in pairs of 64-bit regs. We expect the CSI
   // list to come in sorted by frame index so that we can issue the store
   // pair instructions directly. Assert if we see anything otherwise.
@@ -2483,45 +2479,9 @@
   DebugLoc DL;
   SmallVector<RegPairInfo, 8> RegPairs;
 
-  bool NeedShadowCallStackProlog = false;
-  computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs,
-                                 NeedShadowCallStackProlog, hasFP(MF));
-  const MachineRegisterInfo &MRI = MF.getRegInfo();
-
-  if (NeedShadowCallStackProlog) {
-    // Shadow call stack prolog: str x30, [x18], #8
-    BuildMI(MBB, MI, DL, TII.get(AArch64::STRXpost))
-        .addReg(AArch64::X18, RegState::Define)
-        .addReg(AArch64::LR)
-        .addReg(AArch64::X18)
-        .addImm(8)
-        .setMIFlag(MachineInstr::FrameSetup);
-
-    // This instruction also makes x18 live-in to the entry block.
-    MBB.addLiveIn(AArch64::X18);
-
-    if (NeedsWinCFI)
-      BuildMI(MBB, MI, DL, TII.get(AArch64::SEH_Nop))
-          .setMIFlag(MachineInstr::FrameSetup);
-
-    if (MF.getInfo<AArch64FunctionInfo>()->needsDwarfUnwindInfo()) {
-      // Emit a CFI instruction that causes 8 to be subtracted from the value of
-      // x18 when unwinding past this frame.
-      static const char CFIInst[] = {
-          dwarf::DW_CFA_val_expression,
-          18, // register
-          2,  // length
-          static_cast<char>(unsigned(dwarf::DW_OP_breg18)),
-          static_cast<char>(-8) & 0x7f, // addend (sleb128)
-      };
-      unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createEscape(
-          nullptr, StringRef(CFIInst, sizeof(CFIInst))));
-      BuildMI(MBB, MI, DL, TII.get(AArch64::CFI_INSTRUCTION))
-          .addCFIIndex(CFIIndex)
-          .setMIFlag(MachineInstr::FrameSetup);
-    }
-  }
+  computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs, hasFP(MF));
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
 
   if (homogeneousPrologEpilog(MF)) {
     auto MIB = BuildMI(MBB, MI, DL, TII.get(AArch64::HOM_Prolog))
                    .setMIFlag(MachineInstr::FrameSetup);
@@ -2631,7 +2591,7 @@
 }
 
 bool AArch64FrameLowering::restoreCalleeSavedRegisters(
-    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
     MutableArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
   MachineFunction &MF = *MBB.getParent();
   const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
@@ -2639,14 +2599,12 @@
   SmallVector<RegPairInfo, 8> RegPairs;
   bool NeedsWinCFI = needsWinCFI(MF);
 
-  if (MI != MBB.end())
-    DL = MI->getDebugLoc();
+  if (MBBI != MBB.end())
+    DL = MBBI->getDebugLoc();
 
-  bool NeedShadowCallStackProlog = false;
-  computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs,
-                                 NeedShadowCallStackProlog, hasFP(MF));
+  computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs, hasFP(MF));
 
-  auto EmitMI = [&](const RegPairInfo &RPI) {
+  auto EmitMI = [&](const RegPairInfo &RPI) -> MachineBasicBlock::iterator {
     unsigned Reg1 = RPI.Reg1;
     unsigned Reg2 = RPI.Reg2;
@@ -2703,7 +2661,7 @@
       std::swap(Reg1, Reg2);
       std::swap(FrameIdxReg1, FrameIdxReg2);
     }
-    MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(LdrOpc));
+    MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII.get(LdrOpc));
     if (RPI.isPaired()) {
       MIB.addReg(Reg2, getDefRegState(true));
       MIB.addMemOperand(MF.getMachineMemOperand(
@@ -2720,6 +2678,7 @@
         MachineMemOperand::MOLoad, Size, Alignment));
     if (NeedsWinCFI)
       InsertSEH(MIB, TII, MachineInstr::FrameDestroy);
+    return MIB->getIterator();
   };
 
   // SVE objects are always restored in reverse order.
@@ -2727,31 +2686,33 @@
     if (RPI.isScalable())
       EmitMI(RPI);
 
-  if (ReverseCSRRestoreSeq) {
-    for (const RegPairInfo &RPI : reverse(RegPairs))
-      if (!RPI.isScalable())
-        EmitMI(RPI);
-  } else if (homogeneousPrologEpilog(MF, &MBB)) {
-    auto MIB = BuildMI(MBB, MI, DL, TII.get(AArch64::HOM_Epilog))
+  if (homogeneousPrologEpilog(MF, &MBB)) {
+    auto MIB = BuildMI(MBB, MBBI, DL, TII.get(AArch64::HOM_Epilog))
                    .setMIFlag(MachineInstr::FrameDestroy);
     for (auto &RPI : RegPairs) {
       MIB.addReg(RPI.Reg1, RegState::Define);
       MIB.addReg(RPI.Reg2, RegState::Define);
     }
     return true;
-  } else
-    for (const RegPairInfo &RPI : RegPairs)
-      if (!RPI.isScalable())
-        EmitMI(RPI);
-
-  if (NeedShadowCallStackProlog) {
-    // Shadow call stack epilog: ldr x30, [x18, #-8]!
-    BuildMI(MBB, MI, DL, TII.get(AArch64::LDRXpre))
-        .addReg(AArch64::X18, RegState::Define)
-        .addReg(AArch64::LR, RegState::Define)
-        .addReg(AArch64::X18)
-        .addImm(-8)
-        .setMIFlag(MachineInstr::FrameDestroy);
+  }
+
+  if (ReverseCSRRestoreSeq) {
+    MachineBasicBlock::iterator First = MBB.end();
+    for (const RegPairInfo &RPI : reverse(RegPairs)) {
+      if (RPI.isScalable())
+        continue;
+      MachineBasicBlock::iterator It = EmitMI(RPI);
+      if (First == MBB.end())
+        First = It;
+    }
+    if (First != MBB.end())
+      MBB.splice(MBBI, &MBB, First);
+  } else {
+    for (const RegPairInfo &RPI : RegPairs) {
+      if (RPI.isScalable())
+        continue;
+      (void)EmitMI(RPI);
+    }
   }
 
   return true;
Index: llvm/test/CodeGen/AArch64/reverse-csr-restore-seq.mir
===================================================================
--- llvm/test/CodeGen/AArch64/reverse-csr-restore-seq.mir
+++ llvm/test/CodeGen/AArch64/reverse-csr-restore-seq.mir
@@ -1,5 +1,5 @@
-# RUN: llc -run-pass=prologepilog -reverse-csr-restore-seq -o - -mtriple=aarch64-- %s | FileCheck %s --check-prefixes=CHECK,BEFORELDSTOPT
-# RUN: llc -start-before=prologepilog -stop-after=aarch64-ldst-opt -reverse-csr-restore-seq -o - -mtriple=aarch64-- %s | FileCheck %s --check-prefixes=CHECK,AFTERLDSTOPT
+# RUN: llc -run-pass=prologepilog -reverse-csr-restore-seq -o - -mtriple=aarch64-- %s | FileCheck %s --check-prefixes=CHECK
+# RUN: llc -start-before=prologepilog -stop-after=aarch64-ldst-opt -reverse-csr-restore-seq -o - -mtriple=aarch64-- %s | FileCheck %s --check-prefixes=CHECK
 #
 --- |
@@ -31,13 +31,9 @@
     ; CHECK-NEXT: $x22, $x21 = frame-destroy LDPXi $sp, 4
     ; CHECK-NEXT: $x20, $x19 = frame-destroy LDPXi $sp, 6
 
-    ; Before running the load-store optimizer, we emit a ldp and an add.
-    ; BEFORELDSTOPT-NEXT: $x26, $x25 = frame-destroy LDPXi $sp, 0
-    ; BEFORELDSTOPT-NEXT: $sp = frame-destroy ADDXri $sp, 64, 0
-
-    ; We want to make sure that after running the load-store optimizer, the ldp
-    ; and the add get merged into a post-index ldp.
-    ; AFTERLDSTOPT-NEXT: early-clobber $sp, $x26, $x25 = frame-destroy LDPXpost $sp, 8
+    ; The ldp and the stack increment get merged even before
+    ; the load-store optimizer.
+    ; CHECK-NEXT: early-clobber $sp, $x26, $x25 = frame-destroy LDPXpost $sp, 8
 
     RET_ReallyLR
 ...
@@ -66,10 +62,12 @@
     ; the local stack size. This results in rewriting the offsets for all the
    ; save/restores and forbids us to merge the stack adjustment and the last pop.
     ; In this case, there is no point of moving the first CSR pair at the end.
-    ; CHECK: $x26, $x25 = frame-destroy LDPXi $sp, 2
-    ; CHECK-NEXT: $x24, $x23 = frame-destroy LDPXi $sp, 4
+    ; We do it anyway, as it's a small price to pay for the resulting
+    ; simplification in the epilogue emission code.
+    ; CHECK: $x24, $x23 = frame-destroy LDPXi $sp, 4
     ; CHECK-NEXT: $x22, $x21 = frame-destroy LDPXi $sp, 6
     ; CHECK-NEXT: $x20, $x19 = frame-destroy LDPXi $sp, 8
+    ; CHECK-NEXT: $x26, $x25 = frame-destroy LDPXi $sp, 2
     ; CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 80, 0
     RET_ReallyLR
 ...
@@ -98,9 +96,6 @@
   bb.1:
     ; CHECK: $x21, $x20 = frame-destroy LDPXi $sp, 2
 
-    ; BEFORELDSTOPT-NEXT: $lr = frame-destroy LDRXui $sp, 0
-    ; BEFORELDSTOPT-NEXT: $sp = frame-destroy ADDXri $sp, 32, 0
-
-    ; AFTERLDSTOPT-NEXT: early-clobber $sp, $lr = frame-destroy LDRXpost $sp, 32
+    ; CHECK-NEXT: early-clobber $sp, $lr = frame-destroy LDRXpost $sp, 32
     RET_ReallyLR
 ...
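
A note on the CFIInst escape bytes emitted by emitShadowCallStackPrologue: DW_CFA_val_expression tells the unwinder that a register's value (rather than its save location) is produced by the DWARF expression that follows, and the two-byte expression "DW_OP_breg18 <sleb128 -8>" evaluates to x18 - 8, undoing the post-increment performed by "str x30, [x18], #8". The standalone sketch below (not part of the patch; the DWARF constants are written out numerically, matching dwarf::DW_CFA_val_expression and dwarf::DW_OP_breg18 in llvm/BinaryFormat/Dwarf.h) assembles the same byte sequence:

#include <cstdint>
#include <cstdio>

int main() {
  // DWARF constants, written out numerically (DWARF 5, CFA and expression
  // opcode tables); the llvm::dwarf enumerators have the same values.
  constexpr uint8_t DW_CFA_val_expression = 0x16;
  constexpr uint8_t DW_OP_breg18 = 0x70 + 18; // DW_OP_breg<n> = 0x70 + n

  const uint8_t CFIInst[] = {
      DW_CFA_val_expression, // rule: register's value = DWARF expression
      18,                    // ULEB128 register number (x18)
      2,                     // ULEB128 length of the expression in bytes
      DW_OP_breg18,          // push x18 + <sleb128 addend>
      uint8_t(-8) & 0x7f,    // SLEB128 encoding of -8 in one byte: 0x78
  };
  for (uint8_t B : CFIInst)
    std::printf("0x%02x ", unsigned(B)); // prints: 0x16 0x12 0x02 0x82 0x78
  std::putchar('\n');
}

Because -8 fits in a single SLEB128 byte (sign bit set, no continuation bit needed), "static_cast<char>(-8) & 0x7f" is the complete addend encoding.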
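On the epilogue side, the patch relies on llvm::make_scope_exit (llvm/ADT/ScopeExit.h) so that the shadow call stack pop is emitted at MBB.getFirstTerminator() no matter which of emitEpilogue's early-return paths is taken. A minimal sketch of the idiom, using a hand-rolled stand-in for the LLVM helper rather than the real implementation:

#include <cstdio>
#include <utility>

// Stand-in for llvm::make_scope_exit: the callable runs when the guard goes
// out of scope, i.e. on every return path of the enclosing function.
template <typename Callable> struct ScopeExit {
  Callable F;
  ~ScopeExit() { F(); }
};
template <typename Callable>
ScopeExit<Callable> make_scope_exit(Callable &&F) {
  return ScopeExit<Callable>{std::forward<Callable>(F)};
}

// Shape of the emitEpilogue use: the epilogue code has several early returns,
// and the guard guarantees the shadow call stack pop is still "emitted".
void emitEpilogue(bool EarlyReturn) {
  auto ShadowStackEpilogue =
      make_scope_exit([] { std::puts("ldr x30, [x18, #-8]!"); });
  if (EarlyReturn)
    return; // guard fires here too
  std::puts("... rest of the epilogue ...");
} // guard fires here on the normal path

int main() {
  emitEpilogue(true);
  emitEpilogue(false);
}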