Index: llvm/lib/Target/AArch64/AArch64FrameLowering.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -1049,38 +1049,6 @@ } } -static void adaptForLdStOpt(MachineBasicBlock &MBB, - MachineBasicBlock::iterator FirstSPPopI, - MachineBasicBlock::iterator LastPopI) { - // Sometimes (when we restore in the same order as we save), we can end up - // with code like this: - // - // ldp x26, x25, [sp] - // ldp x24, x23, [sp, #16] - // ldp x22, x21, [sp, #32] - // ldp x20, x19, [sp, #48] - // add sp, sp, #64 - // - // In this case, it is always better to put the first ldp at the end, so - // that the load-store optimizer can run and merge the ldp and the add into - // a post-index ldp. - // If we managed to grab the first pop instruction, move it to the end. - if (ReverseCSRRestoreSeq) - MBB.splice(FirstSPPopI, &MBB, LastPopI); - // We should end up with something like this now: - // - // ldp x24, x23, [sp, #16] - // ldp x22, x21, [sp, #32] - // ldp x20, x19, [sp, #48] - // ldp x26, x25, [sp] - // add sp, sp, #64 - // - // and the load-store optimizer can merge the last two instructions into: - // - // ldp x26, x25, [sp], #64 - // -} - static bool isTargetWindows(const MachineFunction &MF) { return MF.getSubtarget().isTargetWindows(); } @@ -1911,18 +1879,12 @@ if (NoCalleeSaveRestore) StackRestoreBytes += AfterCSRPopSize; - // If we were able to combine the local stack pop with the argument pop, - // then we're done. - bool Done = NoCalleeSaveRestore || AfterCSRPopSize == 0; - - // If we're done after this, make sure to help the load store optimizer. - if (Done) - adaptForLdStOpt(MBB, MBB.getFirstTerminator(), LastPopI); - emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP, StackOffset::getFixed(StackRestoreBytes), TII, MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI); - if (Done) { + // If we were able to combine the local stack pop with the argument pop, + // then we're done. + if (NoCalleeSaveRestore || AfterCSRPopSize == 0) { if (HasWinCFI) { BuildMI(MBB, MBB.getFirstTerminator(), DL, TII->get(AArch64::SEH_EpilogEnd)) @@ -1966,8 +1928,6 @@ FirstSPPopI = Prev; } - adaptForLdStOpt(MBB, FirstSPPopI, LastPopI); - emitFrameOffset(MBB, FirstSPPopI, DL, AArch64::SP, AArch64::SP, StackOffset::getFixed(AfterCSRPopSize), TII, MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI); @@ -2624,7 +2584,7 @@ } bool AArch64FrameLowering::restoreCalleeSavedRegisters( - MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, MutableArrayRef CSI, const TargetRegisterInfo *TRI) const { MachineFunction &MF = *MBB.getParent(); const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); @@ -2632,14 +2592,14 @@ SmallVector RegPairs; bool NeedsWinCFI = needsWinCFI(MF); - if (MI != MBB.end()) - DL = MI->getDebugLoc(); + if (MBBI != MBB.end()) + DL = MBBI->getDebugLoc(); bool NeedShadowCallStackProlog = false; computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs, NeedShadowCallStackProlog, hasFP(MF)); - auto EmitMI = [&](const RegPairInfo &RPI) { + auto EmitMI = [&](const RegPairInfo &RPI) -> MachineBasicBlock::iterator { unsigned Reg1 = RPI.Reg1; unsigned Reg2 = RPI.Reg2; @@ -2696,7 +2656,7 @@ std::swap(Reg1, Reg2); std::swap(FrameIdxReg1, FrameIdxReg2); } - MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(LdrOpc)); + MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII.get(LdrOpc)); if (RPI.isPaired()) { MIB.addReg(Reg2, getDefRegState(true)); MIB.addMemOperand(MF.getMachineMemOperand( @@ -2713,6 +2673,7 @@ MachineMemOperand::MOLoad, Size, Alignment)); if (NeedsWinCFI) InsertSEH(MIB, TII, MachineInstr::FrameDestroy); + return MIB->getIterator(); }; // SVE objects are always restored in reverse order. @@ -2720,26 +2681,38 @@ if (RPI.isScalable()) EmitMI(RPI); - if (ReverseCSRRestoreSeq) { - for (const RegPairInfo &RPI : reverse(RegPairs)) - if (!RPI.isScalable()) - EmitMI(RPI); - } else if (homogeneousPrologEpilog(MF, &MBB)) { - auto MIB = BuildMI(MBB, MI, DL, TII.get(AArch64::HOM_Epilog)) + if (homogeneousPrologEpilog(MF, &MBB)) { + auto MIB = BuildMI(MBB, MBBI, DL, TII.get(AArch64::HOM_Epilog)) .setMIFlag(MachineInstr::FrameDestroy); for (auto &RPI : RegPairs) { MIB.addReg(RPI.Reg1, RegState::Define); MIB.addReg(RPI.Reg2, RegState::Define); } return true; - } else - for (const RegPairInfo &RPI : RegPairs) - if (!RPI.isScalable()) - EmitMI(RPI); + } + + if (ReverseCSRRestoreSeq) { + MachineBasicBlock::iterator First = MBB.end(); + for (const RegPairInfo &RPI : reverse(RegPairs)) { + if (RPI.isScalable()) + continue; + MachineBasicBlock::iterator It = EmitMI(RPI); + if (First == MBB.end()) + First = It; + } + if (First != MBB.end()) + MBB.splice(MBBI, &MBB, First); + } else { + for (const RegPairInfo &RPI : RegPairs) { + if (RPI.isScalable()) + continue; + (void)EmitMI(RPI); + } + } if (NeedShadowCallStackProlog) { // Shadow call stack epilog: ldr x30, [x18, #-8]! - BuildMI(MBB, MI, DL, TII.get(AArch64::LDRXpre)) + BuildMI(MBB, MBBI, DL, TII.get(AArch64::LDRXpre)) .addReg(AArch64::X18, RegState::Define) .addReg(AArch64::LR, RegState::Define) .addReg(AArch64::X18) Index: llvm/test/CodeGen/AArch64/reverse-csr-restore-seq.mir =================================================================== --- llvm/test/CodeGen/AArch64/reverse-csr-restore-seq.mir +++ llvm/test/CodeGen/AArch64/reverse-csr-restore-seq.mir @@ -1,5 +1,5 @@ -# RUN: llc -run-pass=prologepilog -reverse-csr-restore-seq -o - -mtriple=aarch64-- %s | FileCheck %s --check-prefixes=CHECK,BEFORELDSTOPT -# RUN: llc -start-before=prologepilog -stop-after=aarch64-ldst-opt -reverse-csr-restore-seq -o - -mtriple=aarch64-- %s | FileCheck %s --check-prefixes=CHECK,AFTERLDSTOPT +# RUN: llc -run-pass=prologepilog -reverse-csr-restore-seq -o - -mtriple=aarch64-- %s | FileCheck %s --check-prefixes=CHECK +# RUN: llc -start-before=prologepilog -stop-after=aarch64-ldst-opt -reverse-csr-restore-seq -o - -mtriple=aarch64-- %s | FileCheck %s --check-prefixes=CHECK # --- | @@ -31,13 +31,9 @@ ; CHECK-NEXT: $x22, $x21 = frame-destroy LDPXi $sp, 4 ; CHECK-NEXT: $x20, $x19 = frame-destroy LDPXi $sp, 6 - ; Before running the load-store optimizer, we emit a ldp and an add. - ; BEFORELDSTOPT-NEXT: $x26, $x25 = frame-destroy LDPXi $sp, 0 - ; BEFORELDSTOPT-NEXT: $sp = frame-destroy ADDXri $sp, 64, 0 - - ; We want to make sure that after running the load-store optimizer, the ldp - ; and the add get merged into a post-index ldp. - ; AFTERLDSTOPT-NEXT: early-clobber $sp, $x26, $x25 = frame-destroy LDPXpost $sp, 8 + ; The ldp and the stack increment get merged even before + ; the load-store optimizer. + ; CHECK-NEXT: early-clobber $sp, $x26, $x25 = frame-destroy LDPXpost $sp, 8 RET_ReallyLR ... @@ -66,10 +62,12 @@ ; the local stack size. This results in rewriting the offsets for all the ; save/restores and forbids us to merge the stack adjustment and the last pop. ; In this case, there is no point of moving the first CSR pair at the end. - ; CHECK: $x26, $x25 = frame-destroy LDPXi $sp, 2 - ; CHECK-NEXT: $x24, $x23 = frame-destroy LDPXi $sp, 4 + ; We do it anyway, as it's a small price to pay for the resulting + ; simplification in the epilogue emission code. + ; CHECK: $x24, $x23 = frame-destroy LDPXi $sp, 4 ; CHECK-NEXT: $x22, $x21 = frame-destroy LDPXi $sp, 6 ; CHECK-NEXT: $x20, $x19 = frame-destroy LDPXi $sp, 8 + ; CHECK-NEXT: $x26, $x25 = frame-destroy LDPXi $sp, 2 ; CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 80, 0 RET_ReallyLR ... @@ -98,9 +96,6 @@ bb.1: ; CHECK: $x21, $x20 = frame-destroy LDPXi $sp, 2 - ; BEFORELDSTOPT-NEXT: $lr = frame-destroy LDRXui $sp, 0 - ; BEFORELDSTOPT-NEXT: $sp = frame-destroy ADDXri $sp, 32, 0 - - ; AFTERLDSTOPT-NEXT: early-clobber $sp, $lr = frame-destroy LDRXpost $sp, 32 + ; CHECK-NEXT: early-clobber $sp, $lr = frame-destroy LDRXpost $sp, 32 RET_ReallyLR ...