Index: lib/Target/PowerPC/PPCFrameLowering.cpp =================================================================== --- lib/Target/PowerPC/PPCFrameLowering.cpp +++ lib/Target/PowerPC/PPCFrameLowering.cpp @@ -823,6 +823,44 @@ assert((isPPC64 || !MustSaveCR) && "Prologue CR saving supported only in 64-bit mode"); + // Check if we can move the stack update instruction (stdu) down the prologue + // past the callee saves. Hopefully this will avoid the situation where the + // saves are waiting for the update on the store with update to complete. + MachineBasicBlock::iterator StackUpdateLoc = MBBI; + bool MovingStackUpdateDown = false; + // This optimization has a number of guards. At this point we are being very + // cautious and we do not try to do this when we have a fast call or + // we are using PIC base or we are using a frame pointer or a base pointer. + // It would be possible to turn on this optimization under these conditions + // as well but it would require further modifications to the prologue and + // epilogue. For example, if we want to turn on this optimization for + // functions that use frame pointers we would have to take into consideration + // the fact that spills to the stack may be using r30 instead of r1. + // If the frame index requires scavenging there is the possibility that we may + // require a spill in the prologue in which case it is unsafe to move the + // stack pointer update. + // Aside from that we need to have a non-zero frame and we need to have a + // non-large frame size. Notice that we did not use !isLargeFrame but we used + // isInt<16>(FrameSize) instead. 
This is important because this guard has to + // be identical to the one in the epilogue and in the epilogue the variable + // is defined as bool isLargeFrame = !isInt<16>(FrameSize); + const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); + bool RequiresScavenging = TRI->requiresFrameIndexScavenging(MF); + if (FrameSize && !FI->hasFastCall() && !FI->usesPICBase() && !HasFP && + !HasBP && isInt<16>(FrameSize) && !RequiresScavenging) { + const std::vector<CalleeSavedInfo> &Info = MFI.getCalleeSavedInfo(); + for (CalleeSavedInfo CSI : Info) { + int FrIdx = CSI.getFrameIdx(); + if (FrIdx < 0) { + if (MFI.isFixedObjectIndex(FrIdx) && MFI.getObjectOffset(FrIdx) < 0) { + MFI.setObjectOffset(FrIdx, MFI.getObjectOffset(FrIdx) + NegFrameSize); + StackUpdateLoc++; + MovingStackUpdateDown = true; + } + } + } + } + // If we need to spill the CR and the LR but we don't have two separate // registers available, we must spill them one at a time if (MustSaveCR && SingleScratchReg && MustSaveLR) { @@ -886,7 +924,7 @@ } if (MustSaveLR) - BuildMI(MBB, MBBI, dl, StoreInst) + BuildMI(MBB, StackUpdateLoc, dl, StoreInst) .addReg(ScratchReg, getKillRegState(true)) .addImm(LROffset) .addReg(SPReg); @@ -954,7 +992,7 @@ HasSTUX = true; } else if (!isLargeFrame) { - BuildMI(MBB, MBBI, dl, StoreUpdtInst, SPReg) + BuildMI(MBB, StackUpdateLoc, dl, StoreUpdtInst, SPReg) .addReg(SPReg) .addImm(NegFrameSize) .addReg(SPReg); @@ -1194,6 +1232,12 @@ } int Offset = MFI.getObjectOffset(CSI[I].getFrameIdx()); + // We have changed the object offset above but we do not want to change + // the actual offsets in the CFI instruction so we have to undo the + // offset change here. 
+ if (MovingStackUpdateDown) + Offset -= NegFrameSize; + unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset( nullptr, MRI->getDwarfRegNum(Reg, true), Offset)); BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) @@ -1339,6 +1383,25 @@ unsigned RBReg = SPReg; unsigned SPAdd = 0; + // Check if we can move the stack update instruction up the epilogue + // past the callee saves. This will allow the move to LR instruction + // to be executed before the restores of the callee saves which means + // that the callee saves can hide the latency from the MTLR instruction. + const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); + bool RequiresScavenging = TRI->requiresFrameIndexScavenging(MF); + MachineBasicBlock::iterator StackUpdateLoc = MBBI; + if (FrameSize && !FI->hasFastCall() && !FI->usesPICBase() && !HasFP && + !HasBP && !isLargeFrame && !RequiresScavenging) { + const std::vector<CalleeSavedInfo> & Info = MFI.getCalleeSavedInfo(); + for (CalleeSavedInfo CSI : Info) { + int FrIdx = CSI.getFrameIdx(); + if (FrIdx < 0) { + if (MFI.isFixedObjectIndex(FrIdx) && MFI.getObjectOffset(FrIdx) < 0) + StackUpdateLoc--; + } + } + } + if (FrameSize) { // In the prologue, the loaded (or persistent) stack pointer value is // offset by the STDU/STDUX/STWU/STWUX instruction. For targets with red @@ -1368,7 +1431,7 @@ } } else if (!isLargeFrame && !HasBP && !MFI.hasVarSizedObjects()) { if (HasRedZone) { - BuildMI(MBB, MBBI, dl, AddImmInst, SPReg) + BuildMI(MBB, StackUpdateLoc, dl, AddImmInst, SPReg) .addReg(SPReg) .addImm(FrameSize); } else { @@ -1392,7 +1455,7 @@ .addReg(FPReg); RBReg = FPReg; } - BuildMI(MBB, MBBI, dl, LoadInst, RBReg) + BuildMI(MBB, StackUpdateLoc, dl, LoadInst, RBReg) .addImm(0) .addReg(SPReg); } @@ -1425,7 +1488,7 @@ // a base register anyway, because it may happen to be R0. 
bool LoadedLR = false; if (MustSaveLR && RBReg == SPReg && isInt<16>(LROffset+SPAdd)) { - BuildMI(MBB, MBBI, dl, LoadInst, ScratchReg) + BuildMI(MBB, StackUpdateLoc, dl, LoadInst, ScratchReg) .addImm(LROffset+SPAdd) .addReg(RBReg); LoadedLR = true; @@ -1497,7 +1560,7 @@ .addReg(TempReg, getKillRegState(i == e-1)); if (MustSaveLR) - BuildMI(MBB, MBBI, dl, MTLRInst).addReg(ScratchReg); + BuildMI(MBB, StackUpdateLoc, dl, MTLRInst).addReg(ScratchReg); // Callee pop calling convention. Pop parameter/linkage area. Used for tail // call optimization Index: lib/Target/PowerPC/PPCRegisterInfo.h =================================================================== --- lib/Target/PowerPC/PPCRegisterInfo.h +++ lib/Target/PowerPC/PPCRegisterInfo.h @@ -90,9 +90,7 @@ return true; } - bool requiresFrameIndexScavenging(const MachineFunction &MF) const override { - return true; - } + bool requiresFrameIndexScavenging(const MachineFunction &MF) const override; bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const override { return true; Index: lib/Target/PowerPC/PPCRegisterInfo.cpp =================================================================== --- lib/Target/PowerPC/PPCRegisterInfo.cpp +++ lib/Target/PowerPC/PPCRegisterInfo.cpp @@ -297,6 +297,58 @@ } } +bool PPCRegisterInfo::requiresFrameIndexScavenging(const MachineFunction &MF) const { + const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>(); + const MachineFrameInfo &MFI = MF.getFrameInfo(); + const std::vector<CalleeSavedInfo> &Info = MFI.getCalleeSavedInfo(); + bool RequiresScavenging = false; + + // If the saved info is invalid we have to default to true for safety. + if (!MFI.isCalleeSavedInfoValid()) + return true; + + // The saved info is valid so it can be traversed. + // Checking for registers that need saving that do not have load or store + // forms where the address offset is an immediate. 
+ for (unsigned i=0; i= 0) continue; + + if (PPC::GPRCRegClass.contains(Reg) || + PPC::GPRC_NOR0RegClass.contains(Reg) || + PPC::G8RC_NOX0RegClass.contains(Reg) || + PPC::F8RCRegClass.contains(Reg) || + PPC::F4RCRegClass.contains(Reg) || + PPC::CRRCRegClass.contains(Reg) || + PPC::CRBITRCRegClass.contains(Reg)) { + // Do nothing here. + // All of these registers should be stored with a store that + // supports an immediate as part of the address. + } else if (PPC::VRRCRegClass.contains(Reg) || + PPC::VRSAVERCRegClass.contains(Reg) || + PPC::QFRCRegClass.contains(Reg) || + PPC::QSRCRegClass.contains(Reg) || + PPC::QBRCRegClass.contains(Reg) || + PPC::SPILLTOVSRRCRegClass.contains(Reg)) { + RequiresScavenging = true; + } else if (PPC::VSRCRegClass.contains(Reg) || + PPC::VSFRCRegClass.contains(Reg) || + PPC::VSSRCRegClass.contains(Reg)) { + // The P9 hardware has immediate forms of these loads and stores + // however, earlier versions of the hardware do not. + if (!Subtarget.hasP9Vector()) + RequiresScavenging = true; + } + else { + llvm_unreachable("Unknown regclass!"); + } + } + return RequiresScavenging; +} + unsigned PPCRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const { const PPCFrameLowering *TFI = getFrameLowering(MF); Index: test/CodeGen/PowerPC/MCSE-caller-preserved-reg.ll =================================================================== --- test/CodeGen/PowerPC/MCSE-caller-preserved-reg.ll +++ test/CodeGen/PowerPC/MCSE-caller-preserved-reg.ll @@ -15,12 +15,12 @@ define noalias i8* @_ZN2CC3funEv(%class.CC* %this) { ; CHECK-LABEL: _ZN2CC3funEv: ; CHECK: mflr 0 -; CHECK-NEXT: std 0, 16(1) -; CHECK-NEXT: stdu 1, -48(1) ; CHECK-NEXT: .cfi_def_cfa_offset 48 ; CHECK-NEXT: .cfi_offset lr, 16 ; CHECK-NEXT: .cfi_offset r30, -16 -; CHECK-NEXT: std 30, 32(1) +; CHECK-NEXT: std 30, -16(1) +; CHECK-NEXT: std 0, 16(1) +; CHECK-NEXT: stdu 1, -48(1) ; CHECK-NEXT: mr 30, 3 ; CHECK-NEXT: ld 12, 0(30) ; CHECK-NEXT: std 2, 24(1) @@ 
-38,11 +38,11 @@ ; CHECK-NEXT: mr 3, 30 ; CHECK-NEXT: bl _ZN2CC3barEPi ; CHECK-NEXT: nop -; CHECK: ld 30, 32(1) -; CHECK-NEXT: li 3, 0 +; CHECK: li 3, 0 ; CHECK-NEXT: addi 1, 1, 48 ; CHECK-NEXT: ld 0, 16(1) ; CHECK-NEXT: mtlr 0 +; CHECK: ld 30, -16(1) ; CHECK-NEXT: blr entry: %foo = getelementptr inbounds %class.CC, %class.CC* %this, i64 0, i32 0, i32 0 Index: test/CodeGen/PowerPC/ppc-shrink-wrapping.ll =================================================================== --- test/CodeGen/PowerPC/ppc-shrink-wrapping.ll +++ test/CodeGen/PowerPC/ppc-shrink-wrapping.ll @@ -110,7 +110,7 @@ ; ; Epilogue code. ; CHECK: mtlr {{[0-9]+}} -; CHECK-NEXT: blr +; CHECK: blr ; ; ENABLE: .[[ELSE_LABEL]]: # %if.else ; Shift second argument by one and store into returned register. @@ -171,7 +171,7 @@ ; Next BB ; CHECK: %for.end ; CHECK: mtlr {{[0-9]+}} -; CHECK-NEXT: blr +; CHECK: blr define i32 @freqSaveAndRestoreOutsideLoop2(i32 %cond) { entry: br label %for.preheader @@ -209,9 +209,9 @@ ; Make sure we save the link register ; CHECK: mflr {{[0-9]+}} ; -; DISABLE: cmplwi 0, 3, 0 -; DISABLE-NEXT: std +; DISABLE: std ; DISABLE-NEXT: std +; DISABLE: cmplwi 0, 3, 0 ; DISABLE-NEXT: beq 0, .[[ELSE_LABEL:LBB[0-9_]+]] ; ; Loop preheader @@ -240,7 +240,7 @@ ; DISABLE: .[[EPILOG_BB]]: # %if.end ; Epilog code ; CHECK: mtlr {{[0-9]+}} -; CHECK-NEXT: blr +; CHECK: blr ; ; ENABLE: .[[ELSE_LABEL]]: # %if.else ; Shift second argument by one and store into returned register. @@ -291,9 +291,9 @@ ; Make sure we save the link register ; CHECK: mflr {{[0-9]+}} ; -; DISABLE: cmplwi 0, 3, 0 -; DISABLE-NEXT: std +; DISABLE: std ; DISABLE-NEXT: std +; DISABLE: cmplwi 0, 3, 0 ; DISABLE-NEXT: beq 0, .[[ELSE_LABEL:LBB[0-9_]+]] ; ; CHECK: bl somethingElse @@ -322,7 +322,7 @@ ; ; Epilogue code. ; CHECK: mtlr {{[0-9]+}} -; CHECK-NEXT: blr +; CHECK: blr ; ; ENABLE: .[[ELSE_LABEL]]: # %if.else ; Shift second argument by one and store into returned register. 
Index: test/CodeGen/PowerPC/tls_get_addr_clobbers.ll =================================================================== --- test/CodeGen/PowerPC/tls_get_addr_clobbers.ll +++ test/CodeGen/PowerPC/tls_get_addr_clobbers.ll @@ -6,7 +6,7 @@ entry: ; CHECK-LABEL: test_foo: -; CHECK: stdu 1, {{-?[0-9]+}}(1) +; CHECK-DAG: stdu 1, {{-?[0-9]+}}(1) ; CHECK-DAG: mr [[BACKUP_3:[0-9]+]], 3 ; CHECK-DAG: mr [[BACKUP_4:[0-9]+]], 4 ; CHECK-DAG: mr [[BACKUP_5:[0-9]+]], 5 @@ -15,14 +15,14 @@ ; CHECK-DAG: mr [[BACKUP_8:[0-9]+]], 8 ; CHECK-DAG: mr [[BACKUP_9:[0-9]+]], 9 ; CHECK-DAG: mr [[BACKUP_10:[0-9]+]], 10 -; CHECK-DAG: std [[BACKUP_3]], {{[0-9]+}}(1) -; CHECK-DAG: std [[BACKUP_4]], {{[0-9]+}}(1) -; CHECK-DAG: std [[BACKUP_5]], {{[0-9]+}}(1) -; CHECK-DAG: std [[BACKUP_6]], {{[0-9]+}}(1) -; CHECK-DAG: std [[BACKUP_7]], {{[0-9]+}}(1) -; CHECK-DAG: std [[BACKUP_8]], {{[0-9]+}}(1) -; CHECK-DAG: std [[BACKUP_9]], {{[0-9]+}}(1) -; CHECK-DAG: std [[BACKUP_10]], {{[0-9]+}}(1) +; CHECK-DAG: std [[BACKUP_3]], {{-?[0-9]+}}(1) +; CHECK-DAG: std [[BACKUP_4]], {{-?[0-9]+}}(1) +; CHECK-DAG: std [[BACKUP_5]], {{-?[0-9]+}}(1) +; CHECK-DAG: std [[BACKUP_6]], {{-?[0-9]+}}(1) +; CHECK-DAG: std [[BACKUP_7]], {{-?[0-9]+}}(1) +; CHECK-DAG: std [[BACKUP_8]], {{-?[0-9]+}}(1) +; CHECK-DAG: std [[BACKUP_9]], {{-?[0-9]+}}(1) +; CHECK-DAG: std [[BACKUP_10]], {{-?[0-9]+}}(1) ; CHECK: bl __tls_get_addr ; CHECK-DAG: stw 3, 0([[BACKUP_3]]) ; CHECK-DAG: stw 3, 0([[BACKUP_4]])